1. #include <math.h> 2. #include <mpi.h> 3. #include <stdio.h> 4. #include <unistd.h> 5. #define NUM_TRAPEZOIDS 10000000000 6. __attribute__((target(mic))) inline double f(double x) { 7. return 1.00*x*x*exp(-(x-0.0)*(x-0.0)/(2.0*0.25*0.25)) 8. + 0.50*x*x*exp(-(x-0.2)*(x-0.2)/(2.0*0.50*0.50)) 9. + 0.50*x*x*exp(-(x+0.2)*(x+0.2)/(2.0*0.50*0.50)) 10. + 0.25*x*x*exp(-(x-0.4)*(x-0.4)/(2.0*1.00*1.00)) 11. + 0.25*x*x*exp(-(x+0.4)*(x+0.4)/(2.0*1.00*1.00)); 12. } 13. 14. int main (int argc, char *argv[]) { 15. int namelen, rank, size; 16. char name[MPI_MAX_PROCESSOR_NAME]; 17. double upper_bound = 5.0, lower_bound = -5.0; 18. double x0, x1, width; 19. double integral = 0; 20. double compute_time, total_time; 21. int chunk_size; 22. MPI_Init(&argc, &argv); 23. MPI_Comm_size(MPI_COMM_WORLD, &size); 24. MPI_Comm_rank(MPI_COMM_WORLD, &rank); 25. MPI_Get_processor_name(name, &namelen); 26. chunk_size = NUM_TRAPEZOIDS / size; 27. x0 = lower_bound+(upper_bound-lower_bound)*rank/size; 28. x1 = x0 + (upper_bound - lower_bound)/size; 29. width = (x1-x0)/chunk_size; 30. MPI_Barrier(MPI_COMM_WORLD); 31. compute_time = total_time = MPI_Wtime(); 32. #pragma offload target(mic) 33. #pragma omp parallel 34. #pragma omp for reduction(+:integral) 35. for (int i = 0; i < chunk_size ; i++) { 36. integral += 0.5*width *(f(x0+width*i)+f(x0+width*(i+1))); 37. } 38. compute_time = MPI_Wtime() - compute_time; 39. MPI_Allreduce(MPI_IN_PLACE, &integral, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); 40. total_time = MPI_Wtime() - total_time; 41. printf("rank %d of %d on %s: %f seconds\n", rank, size, name, compute_time); 42. if (rank == 0) { 43. printf("integral = %f, time = %f\n", integral, total_time); 44. } 45. MPI_Finalize(); 46. return(0); 47. }