▶ 使用Jacobi 迭代求泊松方程的数值解
● 使用 routine 导语封装模平方函数,把 u1 放进设备端再次减少拷贝开销,数组初始化也在设备中完成
/*
 * Jacobi iteration for a 2-D Poisson problem on a (row + 1) x (col + 1) grid,
 * offloaded with OpenACC.  Both grids live on the device for the whole run:
 * they are initialised there, iterated there, and only the final iterate is
 * copied back to the host.
 *
 * Built without an OpenACC compiler the pragmas are ignored and the program
 * runs serially on the host.
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#ifdef _OPENACC
#include <openacc.h>
#endif

/*
 * Portable wall-clock helpers.
 *   timestruct   - opaque time stamp type
 *   gettime(a)   - store the current time in *a
 *   usec(t1, t2) - elapsed time from t1 to t2 as long long:
 *                  milliseconds on Windows, microseconds elsewhere
 */
#if defined(_WIN32) || defined(_WIN64)
/* NOTE(review): installation-specific PGI wrapper header; the path was
 * reconstructed from a listing that had lost its separators - adjust it to
 * the local PGI install.  Forward slashes avoid escape-like sequences in the
 * header name. */
#include <C:/Program Files/PGI/win64/19.4/include/wrap/sys/timeb.h>
#define timestruct clock_t
#define gettime(a) (*(a) = clock())
#define usec(t1, t2) ((long long)((t2) - (t1)) * 1000 / CLOCKS_PER_SEC)
#else
#include <sys/time.h>
#define gettime(a) gettimeofday(a, NULL)
#define usec(t1, t2) \
    ((long long)((t2).tv_sec - (t1).tv_sec) * 1000000LL + ((t2).tv_usec - (t1).tv_usec))
typedef struct timeval timestruct;
#endif

/*
 * Boundary function u(x, y) = x^2 + y^2.
 * Declared as a sequential OpenACC routine so it can be called from device
 * loops; `static` gives the inline function a definition in this translation
 * unit even when a call is not inlined.
 */
#pragma acc routine seq
static inline float uval(float x, float y)
{
    return x * x + y * y;
}

/*
 * Runs maxIter Jacobi sweeps and prints the elapsed time.
 * Returns 0 on success, EXIT_FAILURE if the grids cannot be allocated.
 */
int main(void)
{
    const int row = 8191, col = 1023;            /* last valid index in each direction */
    const float height = 1.0f, width = 2.0f;     /* extent of the domain */
    const float hx = height / row, wy = width / col;  /* grid spacings */
    const float fij = -4.0f;                     /* constant source term */
    const float hx2 = hx * hx, wy2 = wy * wy;
    const float c1 = hx2 * wy2, c2 = 1.0f / (2.0 * (hx2 + wy2));
    const int maxIter = 100;
    const int colPlus = col + 1;                 /* row stride of the flattened grid */
    const int total = (row + 1) * colPlus;       /* number of grid points */

    float *u0 = malloc(sizeof *u0 * (size_t)total);
    float *u1 = malloc(sizeof *u1 * (size_t)total);
    if (u0 == NULL || u1 == NULL)
    {
        fprintf(stderr, "Failed to allocate %d grid points.\n", total);
        free(u0);
        free(u1);
        return EXIT_FAILURE;
    }

    /*
     * cur holds the latest iterate, next receives the new one.  The roles are
     * swapped through these two pointers so that u0 and u1, which are named
     * in the data clause below, keep their values for the whole data region.
     */
    float *cur = u0;
    float *next = u1;

    timestruct t1, t2;
#ifdef _OPENACC
    acc_init(acc_device_nvidia);
#endif
    gettime(&t1);

    /* Both grids are created on the device without any host-to-device copy. */
#pragma acc data create(u0[0:total], u1[0:total])
    {
#pragma acc kernels present(u0[0:total], u1[0:total])
        {
            /* Initial guess: zero everywhere. */
#pragma acc loop independent
            for (int i = 0; i < total; i++)
                u0[i] = 0.0f;

            /*
             * Boundary values go into BOTH buffers: the sweep below only
             * writes interior points, and the buffers trade roles after every
             * sweep, so each buffer is read as the stencil input in turn.
             */
#pragma acc loop independent
            for (int ix = 0; ix <= row; ix++)
                u0[ix * colPlus] = u1[ix * colPlus] = uval(ix * hx, 0.0f);
#pragma acc loop independent
            for (int ix = 0; ix <= row; ix++)
                u0[ix * colPlus + col] = u1[ix * colPlus + col] = uval(ix * hx, col * wy);
#pragma acc loop independent
            for (int jy = 0; jy <= col; jy++)
                u0[jy] = u1[jy] = uval(0.0f, jy * wy);
#pragma acc loop independent
            for (int jy = 0; jy <= col; jy++)
                u0[row * colPlus + jy] = u1[row * colPlus + jy] = uval(row * hx, jy * wy);
        }

        for (int iter = 0; iter < maxIter; iter++)
        {
#pragma acc kernels present(cur[0:total], next[0:total])
            {
                /* Five-point update of every interior point from cur into next. */
#pragma acc loop independent
                for (int ix = 1; ix < row; ix++)
                {
#pragma acc loop independent
                    for (int jy = 1; jy < col; jy++)
                    {
                        const int idx = ix * colPlus + jy;
                        next[idx] = (c1 * fij
                                     + wy2 * (cur[idx - colPlus] + cur[idx + colPlus])
                                     + hx2 * (cur[idx - 1] + cur[idx + 1])) * c2;
                    }
                }
            }

            /* The freshly written grid becomes the input of the next sweep. */
            float *swap = cur;
            cur = next;
            next = swap;
        }

        /* Copy the latest iterate back to its host buffer; this is correct
         * whether maxIter is even or odd. */
#pragma acc update self(cur[0:total])
    }
    gettime(&t2);

    long long timeElapse = usec(t1, t2);
#if defined(_WIN32) || defined(_WIN64)
    printf("\nElapsed time: %13lld ms.\n", timeElapse);
#else
    printf("\nElapsed time: %13lld us.\n", timeElapse);
#endif

    free(u0);
    free(u1);
#ifdef _OPENACC
    acc_shutdown(acc_device_nvidia);
#endif
    return 0;
}
● 输出结果,win10 中nvvp显示总时间减少到了 247 ms,计算时间 63 ms 几乎不变
D:\Code\OpenACC>pgcc main.c -o main.exe -c99 -Minfo -acc
main:
     42, Generating copyout(u0[:colPlus*(row+1)])
         Generating create(u1[:colPlus*(row+1)])
     44, Generating present(u0[:colPlus*(row+1)])
     47, Loop is parallelizable
         Generating Tesla code
         47, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     47, Memory zero idiom, loop replaced by call to __c_mzero4
     50, Loop is parallelizable
         Generating Tesla code
         50, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     53, Loop is parallelizable
         Generating Tesla code
         53, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     56, Loop is parallelizable
         Generating Tesla code
         56, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     59, Loop is parallelizable
         Generating Tesla code
         59, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     63, Generating present(utemp[:],u1[:colPlus*(row+1)],u0[:colPlus*(row+1)])
     68, Loop is parallelizable
     71, Loop is parallelizable
         Generating Tesla code
         68, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         71, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     71, FMA (fused multiply-add) instruction(s) generated
uval:
     21, Generating acc routine seq
         Generating Tesla code
     22, FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC>main.exe
Elapsed time: 63 ms.
● 输出结果,Ubuntu 中时间也减少了少许
...
Elapsed time: 63274 us.

Accelerator Kernel Timing data
/home/cuan/my.c
  main NVIDIA devicenum=0
    time(us): 36,680
    42: data region reached 2 times
        82: data copyout transfers: 3
             device time(us): total=2,587 max=1,299 min=10 avg=862
    44: compute region reached 1 time
        47: kernel launched 1 time
            grid: [65535] block: [128]
             device time(us): total=155 max=155 min=155 avg=155
            elapsed time(us): total=317 max=317 min=317 avg=317
        50: kernel launched 1 time
            grid: [64] block: [128]
             device time(us): total=5 max=5 min=5 avg=5
            elapsed time(us): total=17 max=17 min=17 avg=17
        53: kernel launched 1 time
            grid: [64] block: [128]
             device time(us): total=5 max=5 min=5 avg=5
            elapsed time(us): total=16 max=16 min=16 avg=16
        56: kernel launched 1 time
            grid: [8] block: [128]
             device time(us): total=1 max=1 min=1 avg=1
            elapsed time(us): total=12 max=12 min=12 avg=12
        59: kernel launched 1 time
            grid: [8] block: [128]
             device time(us): total=2 max=2 min=2 avg=2
            elapsed time(us): total=13 max=13 min=13 avg=13
    44: data region reached 2 times
    63: data region reached 200 times
    65: compute region reached 100 times
        71: kernel launched 100 times
            grid: [32x1024] block: [32x4]
             device time(us): total=33,925 max=358 min=336 avg=339
            elapsed time(us): total=36,792 max=872 min=347 avg=367