1. #include <stdio.h> 2. #include <stdlib.h> 3. #include <string.h> 4. #include <math.h> 5. #include <time.h> 6. #include <sys/time.h> 7. #include <omp.h> 8. #include <assert.h> 9. #include <sys/mman.h> 10. 11. #define REAL float 12. #define NX (256) 13. #define NXP nx 14. 15. #ifndef M_PI 16. #define M_PI (3.1415926535897932384626) 17. #endif 18. 19. 20. void init(REAL *buff, const int nx, const int ny, const int nz, 21. const REAL kx, const REAL ky, const REAL kz, 22. const REAL dx, const REAL dy, const REAL dz, 23. const REAL kappa, const REAL time) { 24. REAL ax, ay, az; 25. int jz, jy, jx; 26. ax = exp(-kappa*time*(kx*kx)); 27. ay = exp(-kappa*time*(ky*ky)); 28. az = exp(-kappa*time*(kz*kz)); 29. for (jz = 0; jz < nz; jz++) { 30. for (jy = 0; jy < ny; jy++) { 31. for (jx = 0; jx < nx; jx++) { 32. int j = jz*NXP*ny + jy*NXP + jx; 33. REAL x = dx*((REAL)(jx + 0.5)); 34. REAL y = dy*((REAL)(jy + 0.5)); 35. REAL z = dz*((REAL)(jz + 0.5)); 36. REAL f0 = (REAL)0.125 37. *(1.0 - ax*cos(kx*x)) 38. *(1.0 - ay*cos(ky*y)) 39. *(1.0 - az*cos(kz*z)); 40. buff[j] = f0; 41. } 42. } 43. } 44. } 45. 46. REAL accuracy(const REAL *b1, REAL *b2, const int len) { 47. REAL err = 0.0; 48. int i; 49. for (i = 0; i < len; i++) { 50. err += (b1[i] - b2[i]) * (b1[i] - b2[i]); 51. } 52. return (REAL)sqrt(err/len); 53. } 54. 55. void diffusion_baseline(REAL *f1, REAL *f2, int nx, int ny, int nz, 56. REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, 57. REAL cb, REAL cc, REAL dt, 58. int count) { 59. int i; 60. for (i = 0; i < count; ++i) { 61. for (int z = 0; z < nz; z++) { 62. for (int y = 0; y < ny; y++) { 63. for (int x = 0; x < nx; x++) { 64. int c, w, e, n, s, b, t; 65. c = x + y * NXP + z * NXP * ny; 66. w = (x == 0) ? c : c - 1; 67. e = (x == NXP-1) ? c : c + 1; 68. n = (y == 0) ? c : c - NXP; 69. s = (y == ny-1) ? c : c + NXP; 70. b = (z == 0) ? c : c - NXP * ny; 71. t = (z == nz-1) ? c : c + NXP * ny; 72. f2[c] = cc * f1[c] + cw * f1[w] + ce * f1[e] 73. + cs * f1[s] + cn * f1[n] + cb * f1[b] + ct * f1[t]; 74. } 75. } 76. } 77. REAL *t = f1; 78. f1 = f2; 79. f2 = t; 80. } 81. return; 82. } 83. 84. static double cur_second(void) { 85. struct timeval tv; 86. gettimeofday(&tv, NULL); 87. return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; 88. } 89. 90. 91. void dump_result(REAL *f, int nx, int ny, int nz, char *out_path) { 92. FILE *out = fopen(out_path, "w"); 93. assert(out); 94. size_t nitems = nx * ny * nz; 95. fwrite(f, sizeof(REAL), nitems, out); 96. fclose(out); 97. } 98. 99. int main(int argc, char *argv[]) 100. { 101. 102. struct timeval time_begin, time_end; 103. 104. int nx = NX; 105. int ny = NX; 106. int nz = NX; 107. 108. REAL *f1 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX); 109. REAL *f2 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX); 110. assert(f1 != MAP_FAILED); 111. assert(f2 != MAP_FAILED); 112. REAL *answer = (REAL *)malloc(sizeof(REAL) * NXP*ny*nz); 113. REAL *f_final = NULL; 114. 115. REAL time = 0.0; 116. int count = 0; 117. int nthreads; 118. 119. REAL l, dx, dy, dz, kx, ky, kz, kappa, dt; 120. REAL ce, cw, cn, cs, ct, cb, cc; 121. 122. #pragma omp parallel 123. #pragma omp master 124. nthreads = omp_get_num_threads(); 125. 126. l = 1.0; 127. kappa = 0.1; 128. dx = dy = dz = l / nx; 129. kx = ky = kz = 2.0 * M_PI; 130. dt = 0.1*dx*dx / kappa; 131. count = 0.1 / dt; 132. f_final = (count % 2)? f2 : f1; 133. 134. init(f1, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time); 135. 136. ce = cw = kappa*dt/(dx*dx); 137. cn = cs = kappa*dt/(dy*dy); 138. ct = cb = kappa*dt/(dz*dz); 139. cc = 1.0 - (ce + cw + cn + cs + ct + cb); 140. 141. printf("Running diffusion kernel %d times\n", count); fflush(stdout); 142. gettimeofday(&time_begin, NULL); 143. diffusion_baseline(f1, f2, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc, 144. dt, count); 145. gettimeofday(&time_end, NULL); 146. time = count * dt; 147. dump_result(f_final, nx, ny, nz, "diffusion_result.dat"); 148. 149. init(answer, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time); 150. REAL err = accuracy(f_final, answer, nx*ny*nz); 151. double elapsed_time = (time_end.tv_sec - time_begin.tv_sec) 152. + (time_end.tv_usec - time_begin.tv_usec)*1.0e-6; 153. REAL mflops = (nx*ny*nz)*13.0*count/elapsed_time * 1.0e-06; 154. double thput = (nx * ny * nz) * sizeof(REAL) * 3.0 * count 155. / elapsed_time * 1.0e-09; 156. 157. fprintf(stderr, "Elapsed time : %.3f (s)\n", elapsed_time); 158. fprintf(stderr, "FLOPS : %.3f (MFlops)\n", mflops); 159. fprintf(stderr, "Throughput : %.3f (GB/s)\n", thput); 160. fprintf(stderr, "Accuracy : %e\n", err); 161. 162. free(f1); 163. free(f2); 164. return 0; 165. }