▶ 使用 routine 构件创建的自定义函数,在并行调用上的差别
● 代码,自定义一个 sqab 函数,使用内建函数 fabsf 和 sqrtf 计算一个矩阵所有元素绝对值的平方根
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <openacc.h>
// Matrix shape. Keep this listing's line positions: the quoted compiler feedback refers to lines 11, 13, 28, 29 and 31.
#define ROW 8
#define COL 64
// sqab: in place, a[i] = sqrtf(fabsf(a[i])) for i in [0, m). Declared as a vector-level OpenACC routine.
#pragma acc routine vector
void sqab(float *a, const int m)
{
#pragma acc loop
    for (int idx = 0; idx < m; idx++)
        a[idx] = sqrtf(fabsf(a[idx]));
}
// main: fills x with row * 10 + col, applies sqab to each row inside a parallel region, prints x[1][1] before and after.
int main(void)
{
    float x[ROW][COL];
    int row, col;
    for (row = 0; row < ROW; row++)
    {
        for (col = 0; col < COL; col++)
            x[row][col] = (float)(row * 10 + col);
    }
    printf("\n x[1][1] = %f\n", x[1][1]);
    // The level clause after "loop" on the next line (none / gang / worker / vector) is what each experiment changes.
#pragma acc parallel loop vector pcopy(x[0:ROW][0:COL])
    for (row = 0; row < ROW; row++)
        sqab(&x[row][0], COL);
    printf("\n x[1][1] = %f\n", x[1][1]);

    // A getchar() pause was left disabled at this point in the original listing.
    return 0;
}
● 输出结果,第 28 行不添加并行级别子句(默认使用 gang)
1 D:CodeOpenACCOpenACCProjectOpenACCProject>pgcc main.c -acc -Minfo -o main_acc.exe 2 sqab: 3 11, Generating Tesla code 4 13, #pragma acc loop vector /* threadIdx.x */ 5 13, Loop is parallelizable 6 main: 7 28, Generating copy(x[:][:]) 8 Accelerator kernel generated 9 Generating Tesla code 10 29, #pragma acc loop gang /* blockIdx.x */ 11 12 D:CodeOpenACCOpenACCProjectOpenACCProject>main_acc.exe 13 14 x[1][1] = 11.000000 15 launch CUDA kernel file=D:CodeOpenACCOpenACCProjectOpenACCProjectmain.c function=main 16 line=28 device=0 threadid=1 num_gangs=8 num_workers=1 vector_length=32 grid=8 block=32 // 8 个 gang 在 blockIdx.x 层级,1 个 worker,vector 在 threadIdx.x 层级 17 18 x[1][1] = 3.316625 19 PGI: "acc_shutdown" not detected, performance results might be incomplete. 20 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete. 21 22 Accelerator Kernel Timing data 23 D:CodeOpenACCOpenACCProjectOpenACCProjectmain.c 24 main NVIDIA devicenum=0 25 time(us): 9 26 28: compute region reached 1 time 27 28: kernel launched 1 time 28 grid: [8] block: [32] 29 elapsed time(us): total=1000 max=1000 min=1000 avg=1000 30 28: data region reached 2 times 31 28: data copyin transfers: 1 32 device time(us): total=4 max=4 min=4 avg=4 33 31: data copyout transfers: 1 34 device time(us): total=5 max=5 min=5 avg=5
● 输出结果,第 28 行添加并行级别子句 worker
1 D:CodeOpenACCOpenACCProjectOpenACCProject>pgcc main.c -acc -Minfo -o main_acc.exe 2 sqab: 3 11, Generating Tesla code 4 13, #pragma acc loop vector /* threadIdx.x */ 5 13, Loop is parallelizable 6 main: 7 28, Generating copy(x[:][:]) 8 Accelerator kernel generated 9 Generating Tesla code 10 29, #pragma acc loop worker(4) /* threadIdx.y */ 11 29, Loop is parallelizable 12 13 D:CodeOpenACCOpenACCProjectOpenACCProject>main_acc.exe 14 15 x[1][1] = 11.000000 16 launch CUDA kernel file=D:CodeOpenACCOpenACCProjectOpenACCProjectmain.c function=main 17 line=28 device=0 threadid=1 num_gangs=1 num_workers=4 vector_length=32 grid=1 block=32x4 // 1 个 gang,4 个 worker 在 threadIdx.y 层级,使用 2 维线程网格 18 19 x[1][1] = 3.316625 20 PGI: "acc_shutdown" not detected, performance results might be incomplete. 21 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete. 22 23 Accelerator Kernel Timing data 24 D:CodeOpenACCOpenACCProjectOpenACCProjectmain.c 25 main NVIDIA devicenum=0 26 time(us): 10 27 28: compute region reached 1 time 28 28: kernel launched 1 time 29 grid: [1] block: [32x4] 30 device time(us): total=0 max=0 min=0 avg=0 31 28: data region reached 2 times 32 28: data copyin transfers: 1 33 device time(us): total=5 max=5 min=5 avg=5 34 31: data copyout transfers: 1 35 device time(us): total=5 max=5 min=5 avg=5
● 输出结果,第 28 行添加并行级别子句 vector
1 D:CodeOpenACCOpenACCProjectOpenACCProject>pgcc main.c -acc -Minfo -o main_acc.exe 2 sqab: 3 11, Generating Tesla code 4 13, #pragma acc loop vector /* threadIdx.x */ 5 13, Loop is parallelizable 6 main: 7 28, Generating copy(x[:][:]) 8 Accelerator kernel generated 9 Generating Tesla code 10 29, #pragma acc loop seq 11 29, Loop is parallelizable 12 13 D:CodeOpenACCOpenACCProjectOpenACCProject>main_acc.exe 14 15 x[1][1] = 11.000000 16 launch CUDA kernel file=D:CodeOpenACCOpenACCProjectOpenACCProjectmain.c function=main 17 line=28 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=32 grid=1 block=32 // 1 个 gang,1 个 worker,并行全都堆在 threadIdx.x 层级上 18 19 x[1][1] = 3.316625 20 PGI: "acc_shutdown" not detected, performance results might be incomplete. 21 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete. 22 23 Accelerator Kernel Timing data 24 D:CodeOpenACCOpenACCProjectOpenACCProjectmain.c 25 main NVIDIA devicenum=0 26 time(us): 10 27 28: compute region reached 1 time 28 28: kernel launched 1 time 29 grid: [1] block: [32] 30 elapsed time(us): total=1000 max=1000 min=1000 avg=1000 31 28: data region reached 2 times 32 28: data copyin transfers: 1 33 device time(us): total=5 max=5 min=5 avg=5 34 31: data copyout transfers: 1 35 device time(us): total=5 max=5 min=5 avg=5
● 如果自定义函数(routine)声明的并行级别不低于主调循环指定的并行级别(例如上面两者同为 vector 的情况),则主调循环的并行子句会被编译器改成 seq;如果自定义函数声明的并行级别低于其内部循环的并行子句级别(例如下面在 routine vector 中使用 loop worker),则会报 warning,并忽略掉内部的并行子句:
1 #pragma acc routine vector 2 void sqab(float *a, const int m) 3 { 4 #pragma acc loop worker 5 for (int idx = 0; idx < m; idx++) 6 a[idx] = sqrtf(fabsf(a[idx])); 7 }
● 编译结果(运行结果同上面的 worker,不写)
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc main.c -acc -Minfo -o main_acc.exe
PGC-W-0155-acc loop worker clause ignored in acc routine vector procedure (main.c: 13)
sqab:
     11, Generating Tesla code
     13, #pragma acc loop vector /* threadIdx.x */
     13, Loop is parallelizable