HPMC User Guide v 1.00
© 2022 Bassem W. Jamaleddine


2-3

   Programming and Running by MIC Native Loading

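This chapter shows how to use your HPMC calculator to write OpenMP (OMP) programs that run on the MIC. The programs are cross-compiled on the host with the -mmic flag and executed natively on the MIC coprocessor, either by copying the binary to the card and running it there, or by launching it from the host with micnativeloadex.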

■ Running the OMP Program on the MIC


The diffusion OMP program

    --  Program Code 2.3.1 :      [LISTING diffusion_omp.c] - [Diffusion OMP]
(raw text)
1.     #include <stdio.h>
2.     #include <stdlib.h>
3.     #include <string.h>
4.     #include <math.h>
5.     #include <time.h>
6.     #include <sys/time.h>
7.     #include <omp.h>
8.     #include <assert.h>
9.     #include <sys/mman.h>
10.    
11.    #define REAL float
12.    #define NX (256)
13.    #define NXP nx
14.    
15.    #ifndef M_PI
16.    #define M_PI (3.1415926535897932384626)
17.    #endif
18.    
19.    
20.    void init(REAL *buff, const int nx, const int ny, const int nz,
21.              const REAL kx, const REAL ky, const REAL kz,
22.              const REAL dx, const REAL dy, const REAL dz,
23.              const REAL kappa, const REAL time) {
24.      REAL ax, ay, az;
25.      int jz, jy, jx;
26.      ax = exp(-kappa*time*(kx*kx));
27.      ay = exp(-kappa*time*(ky*ky));
28.      az = exp(-kappa*time*(kz*kz));
29.      for (jz = 0; jz < nz; jz++) {
30.        for (jy = 0; jy < ny; jy++) {
31.          for (jx = 0; jx < nx; jx++) {
32.            int j = jz*NXP*ny + jy*NXP + jx;
33.            REAL x = dx*((REAL)(jx + 0.5));
34.            REAL y = dy*((REAL)(jy + 0.5));
35.            REAL z = dz*((REAL)(jz + 0.5));
36.            REAL f0 = (REAL)0.125
37.              *(1.0 - ax*cos(kx*x))
38.              *(1.0 - ay*cos(ky*y))
39.              *(1.0 - az*cos(kz*z));
40.            buff[j] = f0;
41.          }
42.        }
43.      }
44.    }
45.    
46.    REAL accuracy(const REAL *b1, REAL *b2, const int len) {
47.      REAL err = 0.0;
48.      int i;
49.      for (i = 0; i < len; i++) {
50.        err += (b1[i] - b2[i]) * (b1[i] - b2[i]);
51.      }
52.      return (REAL)sqrt(err/len);
53.    }
54.    
55.    void diffusion_openmp(REAL *restrict f1, REAL *restrict f2, int nx, int 
      ny, int nz, 
56.                     REAL ce, REAL cw, REAL cn, REAL cs, REAL ct,
57.                     REAL cb, REAL cc, REAL dt, int count) {
58.    #pragma omp parallel
59.      {
60.        REAL *f1_t = f1;
61.        REAL *f2_t = f2;
62.    
63.        for (int i = 0; i < count; ++i) {
64.    #pragma omp for collapse(2)
65.          for (int z = 0; z < nz; z++) {
66.            for (int y = 0; y < ny; y++) {
67.              for (int x = 0; x < nx; x++) {
68.                int c, w, e, n, s, b, t;
69.                c =  x + y * NXP + z * NXP * ny;
70.                w = (x == 0)    ? c : c - 1;
71.                e = (x == NXP-1) ? c : c + 1;
72.                n = (y == 0)    ? c : c - NXP;
73.                s = (y == ny-1) ? c : c + NXP;
74.                b = (z == 0)    ? c : c - NXP * ny;
75.                t = (z == nz-1) ? c : c + NXP * ny;
76.                f2_t[c] = cc * f1_t[c] + cw * f1_t[w] + ce * f1_t[e]
77.                    + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * 
      f1_t[t]; 
78.              }
79.            }
80.          }
81.          REAL *t = f1_t;
82.          f1_t = f2_t;
83.          f2_t = t;
84.        }
85.      }
86.      return;
87.    }
88.    
89.    
90.    static double cur_second(void) {
91.      struct timeval tv;
92.      gettimeofday(&tv, NULL);
93.      return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
94.    }
95.    
96.    
97.    void dump_result(REAL *f, int nx, int ny, int nz, char *out_path) {
98.      FILE *out = fopen(out_path, "w");
99.      assert(out);
100.     size_t nitems = nx * ny * nz;
101.     fwrite(f, sizeof(REAL), nitems, out);
102.     fclose(out);
103.   }
104.   
105.   int main(int argc, char *argv[]) 
106.   {
107.     
108.     struct timeval time_begin, time_end;
109.   
110.     int    nx    = NX;
111.     int    ny    = NX;
112.     int    nz    = NX;
113.   
114.     REAL *f1 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX);
115.     REAL *f2 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX);
116.     assert(f1 != MAP_FAILED);
117.     assert(f2 != MAP_FAILED);
118.     REAL *answer = (REAL *)malloc(sizeof(REAL) * NXP*ny*nz);
119.     REAL *f_final = NULL;
120.   
121.     REAL   time  = 0.0;
122.     int    count = 0;  
123.     int    nthreads;
124.   
125.     REAL l, dx, dy, dz, kx, ky, kz, kappa, dt;
126.     REAL ce, cw, cn, cs, ct, cb, cc;
127.   
128.     #pragma omp parallel
129.     #pragma omp master
130.       nthreads = omp_get_num_threads();
131.   
132.     l = 1.0;
133.     kappa = 0.1;
134.     dx = dy = dz = l / nx;
135.     kx = ky = kz = 2.0 * M_PI;
136.     dt = 0.1*dx*dx / kappa;
137.     count = 0.1 / dt;
138.     f_final = (count % 2)? f2 : f1;
139.   
140.     init(f1, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time);
141.   
142.     ce = cw = kappa*dt/(dx*dx);
143.     cn = cs = kappa*dt/(dy*dy);
144.     ct = cb = kappa*dt/(dz*dz);
145.     cc = 1.0 - (ce + cw + cn + cs + ct + cb);
146.   
147.     printf("Running diffusion kernel %d times\n", count); fflush(stdout);
148.     gettimeofday(&time_begin, NULL);
149.     diffusion_openmp(f1, f2, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc,
150.                    dt, count);
151.     gettimeofday(&time_end, NULL);
152.     time = count * dt;
153.     dump_result(f_final, nx, ny, nz, "diffusion_result.dat");
154.   
155.     init(answer, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time);
156.     REAL err = accuracy(f_final, answer, nx*ny*nz);
157.     double elapsed_time = (time_end.tv_sec - time_begin.tv_sec)
158.         + (time_end.tv_usec - time_begin.tv_usec)*1.0e-6;
159.     REAL mflops = (nx*ny*nz)*13.0*count/elapsed_time * 1.0e-06;
160.     double thput = (nx * ny * nz) * sizeof(REAL) * 3.0 * count
161.         / elapsed_time * 1.0e-09;
162.   
163.     fprintf(stderr, "Elapsed time : %.3f (s)\n", elapsed_time);
164.     fprintf(stderr, "FLOPS        : %.3f (MFlops)\n", mflops);
165.     fprintf(stderr, "Throughput   : %.3f (GB/s)\n", thput);  
166.     fprintf(stderr, "Accuracy     : %e\n", err);
167.     
168.     free(f1);
169.     free(f2);
170.     return 0;
171.   }

HPMC 2022




On the host hpmc9, cross-compile the program diffusion_omp.c for the MIC (the -mmic flag produces a binary that runs natively on the coprocessor):

icc -qopenmp -O3 -std=c99 -mmic diffusion_omp.c -o diffusion_omp_xphi

Copy the compiled code to the MIC

scp diffusion_omp_xphi mic0:/root



Run the code on the MIC

(HPMC9) 00:42 root@mic0: ~ # ./diffusion_omp_xphi

(HPMC9) 00:42 root@mic0: ~ #  ./diffusion_omp_xphi 
Running diffusion kernel 6553 times
Elapsed time : 129.454 (s)
FLOPS        : 11040.484 (MFlops)
Throughput   : 10.191 (GB/s)
Accuracy     : 1.363216e-05



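While the program is running, open a second session on the MIC and use pstree to reveal the threads spawned by the program diffusion_omp_xphi. The process name is truncated to 15 characters in the output, so it appears as diffusion_omp_x; entries in curly braces are threads of that process.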

(HPMC9) 00:41 root@mic0: ~ # pstree -up

(HPMC9) 00:41 root@mic0: ~ #  pstree -up
init(1)-+-coi_daemon(4354,micuser)---{coi_daemon}(4356)
        |-getty(4370)
        |-klogd(4333)
        |-mpssd(4339)-+-{mpssd}(4341)
        |             `-{mpssd}(4342)
        |-portmap(4290,1)
        |-sshd(4322)-+-sshd(4374)---bash(4376)---diffusion_omp_x(4387)-+-{diffusion_omp_x}(4388)
        |            |                                                 |-{diffusion_omp_x}(4389)
        |            |                                                 |-{diffusion_omp_x}(4390)
        |            |                                                 |-{diffusion_omp_x}(4391)
        |            |                                                 |-{diffusion_omp_x}(4392)
        |            |                                                 |-{diffusion_omp_x}(4393)
        |            |                                                 |-{diffusion_omp_x}(4394)
        |            |                                                 |-{diffusion_omp_x}(4395)
        |            |                                                 |-{diffusion_omp_x}(4396)
        |            |                                                 |-{diffusion_omp_x}(4397)
        |            |                                                 |-{diffusion_omp_x}(4398)
...
        |            |                                                 |-{diffusion_omp_x}(4604)
        |            |                                                 |-{diffusion_omp_x}(4605)
        |            |                                                 |-{diffusion_omp_x}(4606)
        |            |                                                 |-{diffusion_omp_x}(4607)
        |            |                                                 |-{diffusion_omp_x}(4608)
        |            |                                                 |-{diffusion_omp_x}(4609)
        |            |                                                 |-{diffusion_omp_x}(4610)
        |            |                                                 |-{diffusion_omp_x}(4611)
        |            |                                                 |-{diffusion_omp_x}(4612)
        |            |                                                 |-{diffusion_omp_x}(4613)
        |            |                                                 |-{diffusion_omp_x}(4614)
        |            |                                                 `-{diffusion_omp_x}(4615)
        |            `-sshd(4616)---bash(4618)---pstree(4621)
        |-syslogd(4330)
        `-udevd(4113)-+-udevd(4336)
                      `-udevd(4363)


 

■ MIC Native Loading: micnativeloadex

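The diffusion baseline program. Its kernel (diffusion_baseline) is a serial loop nest with no OpenMP work-sharing; the only OpenMP construct is the parallel region in main that queries the number of threads.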

    --  Program Code 2.3.2 :      [LISTING diffusion_base.c] - [Diffusion Program]
(raw text)
1.     #include <stdio.h>
2.     #include <stdlib.h>
3.     #include <string.h>
4.     #include <math.h>
5.     #include <time.h>
6.     #include <sys/time.h>
7.     #include <omp.h>
8.     #include <assert.h>
9.     #include <sys/mman.h>
10.    
11.    #define REAL float
12.    #define NX (256)
13.    #define NXP nx
14.    
15.    #ifndef M_PI
16.    #define M_PI (3.1415926535897932384626)
17.    #endif
18.    
19.    
20.    void init(REAL *buff, const int nx, const int ny, const int nz,
21.              const REAL kx, const REAL ky, const REAL kz,
22.              const REAL dx, const REAL dy, const REAL dz,
23.              const REAL kappa, const REAL time) {
24.      REAL ax, ay, az;
25.      int jz, jy, jx;
26.      ax = exp(-kappa*time*(kx*kx));
27.      ay = exp(-kappa*time*(ky*ky));
28.      az = exp(-kappa*time*(kz*kz));
29.      for (jz = 0; jz < nz; jz++) {
30.        for (jy = 0; jy < ny; jy++) {
31.          for (jx = 0; jx < nx; jx++) {
32.            int j = jz*NXP*ny + jy*NXP + jx;
33.            REAL x = dx*((REAL)(jx + 0.5));
34.            REAL y = dy*((REAL)(jy + 0.5));
35.            REAL z = dz*((REAL)(jz + 0.5));
36.            REAL f0 = (REAL)0.125
37.              *(1.0 - ax*cos(kx*x))
38.              *(1.0 - ay*cos(ky*y))
39.              *(1.0 - az*cos(kz*z));
40.            buff[j] = f0;
41.          }
42.        }
43.      }
44.    }
45.    
46.    REAL accuracy(const REAL *b1, REAL *b2, const int len) {
47.      REAL err = 0.0;
48.      int i;
49.      for (i = 0; i < len; i++) {
50.        err += (b1[i] - b2[i]) * (b1[i] - b2[i]);
51.      }
52.      return (REAL)sqrt(err/len);
53.    }
54.    
55.    void diffusion_baseline(REAL *f1, REAL *f2, int nx, int ny, int nz,
56.                       REAL ce, REAL cw, REAL cn, REAL cs, REAL ct,
57.                       REAL cb, REAL cc, REAL dt,
58.                       int count) {
59.      int i;
60.      for (i = 0; i < count; ++i) {
61.        for (int z = 0; z < nz; z++) {
62.          for (int y = 0; y < ny; y++) {
63.            for (int x = 0; x < nx; x++) {
64.              int c, w, e, n, s, b, t;
65.              c =  x + y * NXP + z * NXP * ny;
66.              w = (x == 0)    ? c : c - 1;
67.              e = (x == NXP-1) ? c : c + 1;
68.              n = (y == 0)    ? c : c - NXP;
69.              s = (y == ny-1) ? c : c + NXP;
70.              b = (z == 0)    ? c : c - NXP * ny;
71.              t = (z == nz-1) ? c : c + NXP * ny;
72.              f2[c] = cc * f1[c] + cw * f1[w] + ce * f1[e]
73.                  + cs * f1[s] + cn * f1[n] + cb * f1[b] + ct * f1[t];
74.            }
75.          }
76.        }
77.        REAL *t = f1;
78.        f1 = f2;
79.        f2 = t;
80.      }
81.      return;
82.    }
83.    
84.    static double cur_second(void) {
85.      struct timeval tv;
86.      gettimeofday(&tv, NULL);
87.      return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
88.    }
89.    
90.    
91.    void dump_result(REAL *f, int nx, int ny, int nz, char *out_path) {
92.      FILE *out = fopen(out_path, "w");
93.      assert(out);
94.      size_t nitems = nx * ny * nz;
95.      fwrite(f, sizeof(REAL), nitems, out);
96.      fclose(out);
97.    }
98.    
99.    int main(int argc, char *argv[]) 
100.   {
101.     
102.     struct timeval time_begin, time_end;
103.   
104.     int    nx    = NX;
105.     int    ny    = NX;
106.     int    nz    = NX;
107.   
108.     REAL *f1 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX);
109.     REAL *f2 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX);
110.     assert(f1 != MAP_FAILED);
111.     assert(f2 != MAP_FAILED);
112.     REAL *answer = (REAL *)malloc(sizeof(REAL) * NXP*ny*nz);
113.     REAL *f_final = NULL;
114.   
115.     REAL   time  = 0.0;
116.     int    count = 0;  
117.     int    nthreads;
118.   
119.     REAL l, dx, dy, dz, kx, ky, kz, kappa, dt;
120.     REAL ce, cw, cn, cs, ct, cb, cc;
121.   
122.     #pragma omp parallel
123.     #pragma omp master
124.       nthreads = omp_get_num_threads();
125.   
126.     l = 1.0;
127.     kappa = 0.1;
128.     dx = dy = dz = l / nx;
129.     kx = ky = kz = 2.0 * M_PI;
130.     dt = 0.1*dx*dx / kappa;
131.     count = 0.1 / dt;
132.     f_final = (count % 2)? f2 : f1;
133.   
134.     init(f1, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time);
135.   
136.     ce = cw = kappa*dt/(dx*dx);
137.     cn = cs = kappa*dt/(dy*dy);
138.     ct = cb = kappa*dt/(dz*dz);
139.     cc = 1.0 - (ce + cw + cn + cs + ct + cb);
140.   
141.     printf("Running diffusion kernel %d times\n", count); fflush(stdout);
142.     gettimeofday(&time_begin, NULL);
143.     diffusion_baseline(f1, f2, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc,
144.                    dt, count);
145.     gettimeofday(&time_end, NULL);
146.     time = count * dt;
147.     dump_result(f_final, nx, ny, nz, "diffusion_result.dat");
148.   
149.     init(answer, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time);
150.     REAL err = accuracy(f_final, answer, nx*ny*nz);
151.     double elapsed_time = (time_end.tv_sec - time_begin.tv_sec)
152.         + (time_end.tv_usec - time_begin.tv_usec)*1.0e-6;
153.     REAL mflops = (nx*ny*nz)*13.0*count/elapsed_time * 1.0e-06;
154.     double thput = (nx * ny * nz) * sizeof(REAL) * 3.0 * count
155.         / elapsed_time * 1.0e-09;
156.   
157.     fprintf(stderr, "Elapsed time : %.3f (s)\n", elapsed_time);
158.     fprintf(stderr, "FLOPS        : %.3f (MFlops)\n", mflops);
159.     fprintf(stderr, "Throughput   : %.3f (GB/s)\n", thput);  
160.     fprintf(stderr, "Accuracy     : %e\n", err);
161.     
162.     free(f1);
163.     free(f2);
164.     return 0;
165.   }

HPMC 2022




On the host hpmc9, cross-compile the program diffusion_base.c for the MIC:

icc -qopenmp -O3 -std=c99 -mmic diffusion_base.c -o diffusion_base_xphi



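From the host hpmc9, launch the code on the MIC with micnativeloadex (the binary executes on the coprocessor mic0; its output is relayed back to the host terminal). Note: the output below reports 409 iterations, which appears to correspond to a build with NX set to 64 (count = 0.1 * NX * NX, truncated to an integer); with NX (256) as shown in the listing, the kernel would run 6553 times, as in the previous section.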

00:25 root@HPMC9: micnativeloadex diffusion_base_xphi -d mic0 -e "SINK_LD_LIBRARY_PATH=/miclib/"

00:25 root@HPMC9: ~/Code #  micnativeloadex diffusion_base_xphi -d mic0 -e "SINK_LD_LIBRARY_PATH=/miclib/" 
Running diffusion kernel 409 times
Elapsed time : 5.293 (s)
FLOPS        : 263.316 (MFlops)
Throughput   : 0.243 (GB/s)
Accuracy     : 1.749954e-05


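On the MIC, pstree reveals the threads spawned by the program diffusion_base_xphi. The process name is truncated to 15 characters, so it appears as diffusion_base_. Because it was launched with micnativeloadex, the process runs as a child of coi_daemon rather than of an ssh login shell.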
(HPMC9) 02:39 root@mic0: ~ #  pstree -up
init(1)-+-coi_daemon(4354,micuser)-+-diffusion_base_(6024)-+-{diffusion_base_}(6030)
        |                          |                       |-{diffusion_base_}(6031)
        |                          |                       |-{diffusion_base_}(6032)
        |                          |                       |-{diffusion_base_}(6033)
        |                          |                       |-{diffusion_base_}(6034)
        |                          |                       |-{diffusion_base_}(6035)
        |                          |                       |-{diffusion_base_}(6036)
        |                          |                       |-{diffusion_base_}(6037)
        |                          |                       |-{diffusion_base_}(6038)
...
        |                          |                       |-{diffusion_base_}(6245)
        |                          |                       |-{diffusion_base_}(6246)
        |                          |                       |-{diffusion_base_}(6247)
        |                          |                       |-{diffusion_base_}(6248)
        |                          |                       |-{diffusion_base_}(6249)
        |                          |                       |-{diffusion_base_}(6250)
        |                          |                       |-{diffusion_base_}(6251)
        |                          |                       |-{diffusion_base_}(6252)
        |                          |                       `-{diffusion_base_}(6253)
        |                          |-{coi_daemon}(4356)
        |                          `-{coi_daemon}(6026)
        |-getty(4370)
        |-klogd(4333)
        |-mpssd(4339)-+-{mpssd}(4341)
        |             `-{mpssd}(4342)
        |-portmap(4290,1)
        |-sshd(4322)-+-sshd(4374)---bash(4376)
        |            `-sshd(4616)---bash(4618)---pstree(6254)
        |-syslogd(4330)
        `-udevd(4113)-+-udevd(4336)
                      `-udevd(4363)