HPMC User Guide v 1.00
© 2022 Bassem W. Jamaleddine
This chapter shows how to use your HPMC calculator to write OMP programs to be run on the MIC. The programs are being offloaded to the MIC Co-Processor.
■ Running the OMP Program on the MIC
The diffusion OMP program
1. #include <stdio.h> 2. #include <stdlib.h> 3. #include <string.h> 4. #include <math.h> 5. #include <time.h> 6. #include <sys/time.h> 7. #include <omp.h> 8. #include <assert.h> 9. #include <sys/mman.h> 10. 11. #define REAL float 12. #define NX (256) 13. #define NXP nx 14. 15. #ifndef M_PI 16. #define M_PI (3.1415926535897932384626) 17. #endif 18. 19. 20. void init(REAL *buff, const int nx, const int ny, const int nz, 21. const REAL kx, const REAL ky, const REAL kz, 22. const REAL dx, const REAL dy, const REAL dz, 23. const REAL kappa, const REAL time) { 24. REAL ax, ay, az; 25. int jz, jy, jx; 26. ax = exp(-kappa*time*(kx*kx)); 27. ay = exp(-kappa*time*(ky*ky)); 28. az = exp(-kappa*time*(kz*kz)); 29. for (jz = 0; jz < nz; jz++) { 30. for (jy = 0; jy < ny; jy++) { 31. for (jx = 0; jx < nx; jx++) { 32. int j = jz*NXP*ny + jy*NXP + jx; 33. REAL x = dx*((REAL)(jx + 0.5)); 34. REAL y = dy*((REAL)(jy + 0.5)); 35. REAL z = dz*((REAL)(jz + 0.5)); 36. REAL f0 = (REAL)0.125 37. *(1.0 - ax*cos(kx*x)) 38. *(1.0 - ay*cos(ky*y)) 39. *(1.0 - az*cos(kz*z)); 40. buff[j] = f0; 41. } 42. } 43. } 44. } 45. 46. REAL accuracy(const REAL *b1, REAL *b2, const int len) { 47. REAL err = 0.0; 48. int i; 49. for (i = 0; i < len; i++) { 50. err += (b1[i] - b2[i]) * (b1[i] - b2[i]); 51. } 52. return (REAL)sqrt(err/len); 53. } 54. 55. void diffusion_openmp(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz, 56. REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, 57. REAL cb, REAL cc, REAL dt, int count) { 58. #pragma omp parallel 59. { 60. REAL *f1_t = f1; 61. REAL *f2_t = f2; 62. 63. for (int i = 0; i < count; ++i) { 64. #pragma omp for collapse(2) 65. for (int z = 0; z < nz; z++) { 66. for (int y = 0; y < ny; y++) { 67. for (int x = 0; x < nx; x++) { 68. int c, w, e, n, s, b, t; 69. c = x + y * NXP + z * NXP * ny; 70. w = (x == 0) ? c : c - 1; 71. e = (x == NXP-1) ? c : c + 1; 72. n = (y == 0) ? c : c - NXP; 73. s = (y == ny-1) ? c : c + NXP; 74. b = (z == 0) ? c : c - NXP * ny; 75. t = (z == nz-1) ? c : c + NXP * ny; 76. f2_t[c] = cc * f1_t[c] + cw * f1_t[w] + ce * f1_t[e] 77. + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t]; 78. } 79. } 80. } 81. REAL *t = f1_t; 82. f1_t = f2_t; 83. f2_t = t; 84. } 85. } 86. return; 87. } 88. 89. 90. static double cur_second(void) { 91. struct timeval tv; 92. gettimeofday(&tv, NULL); 93. return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; 94. } 95. 96. 97. void dump_result(REAL *f, int nx, int ny, int nz, char *out_path) { 98. FILE *out = fopen(out_path, "w"); 99. assert(out); 100. size_t nitems = nx * ny * nz; 101. fwrite(f, sizeof(REAL), nitems, out); 102. fclose(out); 103. } 104. 105. int main(int argc, char *argv[]) 106. { 107. 108. struct timeval time_begin, time_end; 109. 110. int nx = NX; 111. int ny = NX; 112. int nz = NX; 113. 114. REAL *f1 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX); 115. REAL *f2 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX); 116. assert(f1 != MAP_FAILED); 117. assert(f2 != MAP_FAILED); 118. REAL *answer = (REAL *)malloc(sizeof(REAL) * NXP*ny*nz); 119. REAL *f_final = NULL; 120. 121. REAL time = 0.0; 122. int count = 0; 123. int nthreads; 124. 125. REAL l, dx, dy, dz, kx, ky, kz, kappa, dt; 126. REAL ce, cw, cn, cs, ct, cb, cc; 127. 128. #pragma omp parallel 129. #pragma omp master 130. nthreads = omp_get_num_threads(); 131. 132. l = 1.0; 133. kappa = 0.1; 134. dx = dy = dz = l / nx; 135. kx = ky = kz = 2.0 * M_PI; 136. dt = 0.1*dx*dx / kappa; 137. count = 0.1 / dt; 138. f_final = (count % 2)? f2 : f1; 139. 140. init(f1, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time); 141. 142. ce = cw = kappa*dt/(dx*dx); 143. cn = cs = kappa*dt/(dy*dy); 144. ct = cb = kappa*dt/(dz*dz); 145. cc = 1.0 - (ce + cw + cn + cs + ct + cb); 146. 147. printf("Running diffusion kernel %d times\n", count); fflush(stdout); 148. gettimeofday(&time_begin, NULL); 149. diffusion_openmp(f1, f2, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc, 150. dt, count); 151. gettimeofday(&time_end, NULL); 152. time = count * dt; 153. dump_result(f_final, nx, ny, nz, "diffusion_result.dat"); 154. 155. init(answer, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time); 156. REAL err = accuracy(f_final, answer, nx*ny*nz); 157. double elapsed_time = (time_end.tv_sec - time_begin.tv_sec) 158. + (time_end.tv_usec - time_begin.tv_usec)*1.0e-6; 159. REAL mflops = (nx*ny*nz)*13.0*count/elapsed_time * 1.0e-06; 160. double thput = (nx * ny * nz) * sizeof(REAL) * 3.0 * count 161. / elapsed_time * 1.0e-09; 162. 163. fprintf(stderr, "Elapsed time : %.3f (s)\n", elapsed_time); 164. fprintf(stderr, "FLOPS : %.3f (MFlops)\n", mflops); 165. fprintf(stderr, "Throughput : %.3f (GB/s)\n", thput); 166. fprintf(stderr, "Accuracy : %e\n", err); 167. 168. free(f1); 169. free(f2); 170. return 0; 171. }
On the host hpmc9 compile the program diffusion_omp.c
# icc -qopenmp -O3 -std=c99 -mmic diffusion_omp.c -o diffusion_omp_xphi
# scp diffusion_omp_xphi mic0:/root
Run the code on the MIC
# (HPMC9) 00:42 root@mic0: ~ # ./diffusion_omp_xphi
(HPMC9) 00:42 root@mic0: ~ # ./diffusion_omp_xphi Running diffusion kernel 6553 times Elapsed time : 129.454 (s) FLOPS : 11040.484 (MFlops) Throughput : 10.191 (GB/s) Accuracy : 1.363216e-05
On the MIC we can reveal the processes run by the program diffusion_omp_x.
# (HPMC9) 00:41 root@mic0: ~ # pstree -up
(HPMC9) 00:41 root@mic0: ~ # pstree -up
init(1)-+-coi_daemon(4354,micuser)---{coi_daemon}(4356)
|-getty(4370)
|-klogd(4333)
|-mpssd(4339)-+-{mpssd}(4341)
| `-{mpssd}(4342)
|-portmap(4290,1)
|-sshd(4322)-+-sshd(4374)---bash(4376)---diffusion_omp_x(4387)-+-{diffusion_omp_x}(4388)
| | |-{diffusion_omp_x}(4389)
| | |-{diffusion_omp_x}(4390)
| | |-{diffusion_omp_x}(4391)
| | |-{diffusion_omp_x}(4392)
| | |-{diffusion_omp_x}(4393)
| | |-{diffusion_omp_x}(4394)
| | |-{diffusion_omp_x}(4395)
| | |-{diffusion_omp_x}(4396)
| | |-{diffusion_omp_x}(4397)
| | |-{diffusion_omp_x}(4398)
...
| | |-{diffusion_omp_x}(4604)
| | |-{diffusion_omp_x}(4605)
| | |-{diffusion_omp_x}(4606)
| | |-{diffusion_omp_x}(4607)
| | |-{diffusion_omp_x}(4608)
| | |-{diffusion_omp_x}(4609)
| | |-{diffusion_omp_x}(4610)
| | |-{diffusion_omp_x}(4611)
| | |-{diffusion_omp_x}(4612)
| | |-{diffusion_omp_x}(4613)
| | |-{diffusion_omp_x}(4614)
| | `-{diffusion_omp_x}(4615)
| `-sshd(4616)---bash(4618)---pstree(4621)
|-syslogd(4330)
`-udevd(4113)-+-udevd(4336)
`-udevd(4363)
■ MIC Native Loading: micnativeloadex
The diffusion OMP program
1. #include <stdio.h> 2. #include <stdlib.h> 3. #include <string.h> 4. #include <math.h> 5. #include <time.h> 6. #include <sys/time.h> 7. #include <omp.h> 8. #include <assert.h> 9. #include <sys/mman.h> 10. 11. #define REAL float 12. #define NX (256) 13. #define NXP nx 14. 15. #ifndef M_PI 16. #define M_PI (3.1415926535897932384626) 17. #endif 18. 19. 20. void init(REAL *buff, const int nx, const int ny, const int nz, 21. const REAL kx, const REAL ky, const REAL kz, 22. const REAL dx, const REAL dy, const REAL dz, 23. const REAL kappa, const REAL time) { 24. REAL ax, ay, az; 25. int jz, jy, jx; 26. ax = exp(-kappa*time*(kx*kx)); 27. ay = exp(-kappa*time*(ky*ky)); 28. az = exp(-kappa*time*(kz*kz)); 29. for (jz = 0; jz < nz; jz++) { 30. for (jy = 0; jy < ny; jy++) { 31. for (jx = 0; jx < nx; jx++) { 32. int j = jz*NXP*ny + jy*NXP + jx; 33. REAL x = dx*((REAL)(jx + 0.5)); 34. REAL y = dy*((REAL)(jy + 0.5)); 35. REAL z = dz*((REAL)(jz + 0.5)); 36. REAL f0 = (REAL)0.125 37. *(1.0 - ax*cos(kx*x)) 38. *(1.0 - ay*cos(ky*y)) 39. *(1.0 - az*cos(kz*z)); 40. buff[j] = f0; 41. } 42. } 43. } 44. } 45. 46. REAL accuracy(const REAL *b1, REAL *b2, const int len) { 47. REAL err = 0.0; 48. int i; 49. for (i = 0; i < len; i++) { 50. err += (b1[i] - b2[i]) * (b1[i] - b2[i]); 51. } 52. return (REAL)sqrt(err/len); 53. } 54. 55. void diffusion_baseline(REAL *f1, REAL *f2, int nx, int ny, int nz, 56. REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, 57. REAL cb, REAL cc, REAL dt, 58. int count) { 59. int i; 60. for (i = 0; i < count; ++i) { 61. for (int z = 0; z < nz; z++) { 62. for (int y = 0; y < ny; y++) { 63. for (int x = 0; x < nx; x++) { 64. int c, w, e, n, s, b, t; 65. c = x + y * NXP + z * NXP * ny; 66. w = (x == 0) ? c : c - 1; 67. e = (x == NXP-1) ? c : c + 1; 68. n = (y == 0) ? c : c - NXP; 69. s = (y == ny-1) ? c : c + NXP; 70. b = (z == 0) ? c : c - NXP * ny; 71. t = (z == nz-1) ? c : c + NXP * ny; 72. f2[c] = cc * f1[c] + cw * f1[w] + ce * f1[e] 73. + cs * f1[s] + cn * f1[n] + cb * f1[b] + ct * f1[t]; 74. } 75. } 76. } 77. REAL *t = f1; 78. f1 = f2; 79. f2 = t; 80. } 81. return; 82. } 83. 84. static double cur_second(void) { 85. struct timeval tv; 86. gettimeofday(&tv, NULL); 87. return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; 88. } 89. 90. 91. void dump_result(REAL *f, int nx, int ny, int nz, char *out_path) { 92. FILE *out = fopen(out_path, "w"); 93. assert(out); 94. size_t nitems = nx * ny * nz; 95. fwrite(f, sizeof(REAL), nitems, out); 96. fclose(out); 97. } 98. 99. int main(int argc, char *argv[]) 100. { 101. 102. struct timeval time_begin, time_end; 103. 104. int nx = NX; 105. int ny = NX; 106. int nz = NX; 107. 108. REAL *f1 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX); 109. REAL *f2 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX); 110. assert(f1 != MAP_FAILED); 111. assert(f2 != MAP_FAILED); 112. REAL *answer = (REAL *)malloc(sizeof(REAL) * NXP*ny*nz); 113. REAL *f_final = NULL; 114. 115. REAL time = 0.0; 116. int count = 0; 117. int nthreads; 118. 119. REAL l, dx, dy, dz, kx, ky, kz, kappa, dt; 120. REAL ce, cw, cn, cs, ct, cb, cc; 121. 122. #pragma omp parallel 123. #pragma omp master 124. nthreads = omp_get_num_threads(); 125. 126. l = 1.0; 127. kappa = 0.1; 128. dx = dy = dz = l / nx; 129. kx = ky = kz = 2.0 * M_PI; 130. dt = 0.1*dx*dx / kappa; 131. count = 0.1 / dt; 132. f_final = (count % 2)? f2 : f1; 133. 134. init(f1, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time); 135. 136. ce = cw = kappa*dt/(dx*dx); 137. cn = cs = kappa*dt/(dy*dy); 138. ct = cb = kappa*dt/(dz*dz); 139. cc = 1.0 - (ce + cw + cn + cs + ct + cb); 140. 141. printf("Running diffusion kernel %d times\n", count); fflush(stdout); 142. gettimeofday(&time_begin, NULL); 143. diffusion_baseline(f1, f2, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc, 144. dt, count); 145. gettimeofday(&time_end, NULL); 146. time = count * dt; 147. dump_result(f_final, nx, ny, nz, "diffusion_result.dat"); 148. 149. init(answer, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time); 150. REAL err = accuracy(f_final, answer, nx*ny*nz); 151. double elapsed_time = (time_end.tv_sec - time_begin.tv_sec) 152. + (time_end.tv_usec - time_begin.tv_usec)*1.0e-6; 153. REAL mflops = (nx*ny*nz)*13.0*count/elapsed_time * 1.0e-06; 154. double thput = (nx * ny * nz) * sizeof(REAL) * 3.0 * count 155. / elapsed_time * 1.0e-09; 156. 157. fprintf(stderr, "Elapsed time : %.3f (s)\n", elapsed_time); 158. fprintf(stderr, "FLOPS : %.3f (MFlops)\n", mflops); 159. fprintf(stderr, "Throughput : %.3f (GB/s)\n", thput); 160. fprintf(stderr, "Accuracy : %e\n", err); 161. 162. free(f1); 163. free(f2); 164. return 0; 165. }
On the host hpmc9 compile the diffusion_base.c
# icc -qopenmp -O3 -std=c99 -mmic diffusion_base.c -o diffusion_base_xphi
Run the code on the host hpmc9
# 00:25 root@HPMC9: micnativeloadex diffusion_base_xphi -d mic0 -e "SINK_LD_LIBRARY_PATH=/miclib/"
00:25 root@HPMC9: ~/Code # micnativeloadex diffusion_base_xphi -d mic0 -e "SINK_LD_LIBRARY_PATH=/miclib/" Running diffusion kernel 409 times Elapsed time : 5.293 (s) FLOPS : 263.316 (MFlops) Throughput : 0.243 (GB/s) Accuracy : 1.749954e-05
On the MIC we can reveal the processes run by the program diffusion_base.
(HPMC9) 02:39 root@mic0: ~ # pstree -up
init(1)-+-coi_daemon(4354,micuser)-+-diffusion_base_(6024)-+-{diffusion_base_}(6030)
| | |-{diffusion_base_}(6031)
| | |-{diffusion_base_}(6032)
| | |-{diffusion_base_}(6033)
| | |-{diffusion_base_}(6034)
| | |-{diffusion_base_}(6035)
| | |-{diffusion_base_}(6036)
| | |-{diffusion_base_}(6037)
| | |-{diffusion_base_}(6038)
...
| | |-{diffusion_base_}(6245)
| | |-{diffusion_base_}(6246)
| | |-{diffusion_base_}(6247)
| | |-{diffusion_base_}(6248)
| | |-{diffusion_base_}(6249)
| | |-{diffusion_base_}(6250)
| | |-{diffusion_base_}(6251)
| | |-{diffusion_base_}(6252)
| | `-{diffusion_base_}(6253)
| |-{coi_daemon}(4356)
| `-{coi_daemon}(6026)
|-getty(4370)
|-klogd(4333)
|-mpssd(4339)-+-{mpssd}(4341)
| `-{mpssd}(4342)
|-portmap(4290,1)
|-sshd(4322)-+-sshd(4374)---bash(4376)
| `-sshd(4616)---bash(4618)---pstree(6254)
|-syslogd(4330)
`-udevd(4113)-+-udevd(4336)
`-udevd(4363)