OpenACC Julia 图形

▶ 书上的代码，逐步优化绘制 Julia 图形的代码

● 无并行优化（手动优化了变量等）

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <openacc.h>
 4 
 5 #define N   (1024 * 8)
 6 
 7 int julia(const float cre, const float cim, float zre, float zim, const int maxIter)// Escape-iteration count of the start point z = zre + i*zim under z -> z*z + c, with c = cre + i*cim
 8 {
 9     float zre2 = 0.0f, zim2 = 0.0f;                 // holds the result of the first step of each pair
10     for (int iter = 1; iter < maxIter; iter += 2)   // two z -> z*z + c steps per loop pass
11     {
12         zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;         // first step: (zre2, zim2) = z*z + c
13         if (zre2 * zre2 + zim2 * zim2 > 4.0f)                                   // |z|^2 > 4: the point has escaped
14             return iter;
15 
16         zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;     // second step, written back into (zre, zim)
17         if (zre * zre + zim * zim > 4.0f)
18             return iter;                            // NOTE(review): same value as the first-step exit although this is step iter + 1 -- confirm intended
19     }
20     return maxIter + 1 + (maxIter % 2);             // never escaped: a value above every in-loop return (129 for maxIter = 128)
21 }
22 
23 int main()
24 {
25     const int maxIter = 128;                                // iteration cap per pixel
26     const float cre = -0.8350, cim = -0.2321, h = 4.0f / N; // Julia constant c and grid step (N samples starting at -2 on each axis)
27     FILE *pf = fopen("R:/output.txt", "w");
28     if (pf == NULL)                                         // fprintf on a null stream is undefined behaviour
29     {
30         perror("R:/output.txt");
31         return 1;
32     }
33 
34     // Serial baseline: every pixel is computed and written straight to the file,
35     // so no image buffer is needed (the unused allocation was removed).
36     for (int i = 0; i < N; i++)
37     {
38         for (int j = 0; j < N; j++)
39             fprintf(pf, "%d ", julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter));
40         fprintf(pf, "\n");                                  // end of one row of the image
41     }
42 
43     fclose(pf);
44     //getchar();
45     return 0;
46 }

● 输出结果（后面所有代码的输出都相同，不再写了）

● 改进 1，计算并行化

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <openacc.h>
 4 
 5 #define N   (1024 * 8)
 6 
 7 #pragma acc routine seq
 8 int julia(const float cre, const float cim, float zre, float zim, const int maxIter)// Escape-iteration count of the start point z = zre + i*zim under z -> z*z + c, with c = cre + i*cim
 9 {
10     float zre2 = 0.0f, zim2 = 0.0f;                 // holds the result of the first step of each pair
11     for (int iter = 1; iter < maxIter; iter += 2)   // two z -> z*z + c steps per loop pass
12     {
13         zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;         // first step: (zre2, zim2) = z*z + c
14         if (zre2 * zre2 + zim2 * zim2 > 4.0f)                                   // |z|^2 > 4: the point has escaped
15             return iter;
16 
17         zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;     // second step, written back into (zre, zim)
18         if (zre * zre + zim * zim > 4.0f)
19             return iter;                            // NOTE(review): same value as the first-step exit although this is step iter + 1 -- confirm intended
20     }
21     return maxIter + 1 + (maxIter % 2);             // never escaped: a value above every in-loop return (129 for maxIter = 128)
22 }
23 
24 int main()
25 {
26     const int maxIter = 128;                                // iteration cap per pixel
27     const float cre = -0.8350, cim = -0.2321, h = 4.0f / N; // Julia constant c and grid step
28     int *image = (int *)malloc(sizeof(int) * N * N);
29     if (image == NULL) return 1;                            // allocation failed; placed on the former blank line so listing numbers still match the -Minfo output
30 #pragma acc data copyout(image[0:N * N])    // data region: image is copied back to the host when the region ends
31     {
32 #pragma acc kernels loop independent        // offload the loop nest; 'independent' asserts the iterations do not depend on each other
33         for (int i = 0; i < N; i++)
34         {
35             for (int j = 0; j < N; j++)
36                 image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);
37         }
38     }
39     /*// file output is commented out so that Nvvp does not include it in the profile
40     FILE *pf = fopen("R:/output.txt", "w");
41     for (int i = 0; i < N; i++)
42     {
43         for (int j = 0; j < N; j++)
44             fprintf(pf, "%d ", image[i * N + j]);
45         fprintf(pf, "\n");
46     }
47     fclose(pf);
48     */
49     free(image);
50     //getchar();
51     return 0;
52 }

● 输出结果

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe
julia:
      9, Generating acc routine seq
         Generating Tesla code
     11, FMA (fused multiply-add) instruction(s) generated
     17, FMA (fused multiply-add) instruction(s) generated
main:
     30, Generating copyout(image[:67108864])
     33, Loop is parallelizable
         FMA (fused multiply-add) instruction(s) generated
     35, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         33, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         35, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     35, FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=35 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
PGI: "acc_shutdown" not detected, performance results might be incomplete.
 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=0
    time(us): 20,486
    30: data region reached 2 times
        49: data copyout transfers: 17
             device time(us): total=20,486 max=1,288 min=5 avg=1,205
    32: compute region reached 1 time
        35: kernel launched 1 time
            grid: [256x128]  block: [32x4]
             device time(us): total=0 max=0 min=0 avg=0

● 改进 2，分块计算，没有明显性能提升，为异步做准备

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <openacc.h>
 4 
 5 #define N   (1024 * 8)
 6 
 7 #pragma acc routine seq
 8 int julia(const float cre, const float cim, float zre, float zim, const int maxIter)// Escape-iteration count of the start point z = zre + i*zim under z -> z*z + c, with c = cre + i*cim
 9 {
10     float zre2 = 0.0f, zim2 = 0.0f;                 // holds the result of the first step of each pair
11     for (int iter = 1; iter < maxIter; iter += 2)   // two z -> z*z + c steps per loop pass
12     {
13         zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;         // first step: (zre2, zim2) = z*z + c
14         if (zre2 * zre2 + zim2 * zim2 > 4.0f)                                   // |z|^2 > 4: the point has escaped
15             return iter;
16 
17         zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;     // second step, written back into (zre, zim)
18         if (zre * zre + zim * zim > 4.0f)
19             return iter;                            // NOTE(review): same value as the first-step exit although this is step iter + 1 -- confirm intended
20     }
21     return maxIter + 1 + (maxIter % 2);             // never escaped: a value above every in-loop return (129 for maxIter = 128)
22 }
23 
24 int main()
25 {
26     const int maxIter = 128;                                // iteration cap per pixel
27     const float cre = -0.8350, cim = -0.2321, h = 4.0f / N; // Julia constant c and grid step
28     int *image = (int *)malloc(sizeof(int) * N * N);
29     if (image == NULL) return 1;                            // allocation failed; placed on the former blank line so listing numbers still match the -Minfo output
30 #pragma acc data copyout(image[0:N * N])                    // whole image is copied back once, when the region ends
31     {
32         const int numblock = 4;                         // number of row blocks
33         for (int block = 0; block < numblock; block++)  // one kernel launch per block
34         {
35             const int start = block * (N / numblock), end = start + N / numblock;   // first row and one-past-last row of this block
36 #pragma acc kernels loop independent
37             for (int i = start; i < end; i++)
38             {
39                 for (int j = 0; j < N; j++)
40                     image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);
41             }
42         }
43     }
44     /*
45     FILE *pf = fopen("R:/output.txt", "w");
46     for (int i = 0; i < N; i++)
47     {
48         for (int j = 0; j < N; j++)
49             fprintf(pf, "%d ", image[i * N + j]);
50         fprintf(pf, "\n");
51     }
52     fclose(pf);
53     */
54     free(image);
55     //getchar();
56     return 0;
57 }

● 输出结果

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe
julia:
      9, Generating acc routine seq
         Generating Tesla code
     11, FMA (fused multiply-add) instruction(s) generated
     17, FMA (fused multiply-add) instruction(s) generated
main:
     30, Generating copyout(image[:67108864])
     37, Loop is parallelizable
         FMA (fused multiply-add) instruction(s) generated
     39, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         37, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         39, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     39, FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=39 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=39 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=39 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=39 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
PGI: "acc_shutdown" not detected, performance results might be incomplete.
 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=0
    time(us): 20,456
    30: data region reached 2 times
        54: data copyout transfers: 17
             device time(us): total=20,456 max=1,297 min=11 avg=1,203
    36: compute region reached 4 times
        39: kernel launched 4 times
            grid: [256x128]  block: [32x4]
             device time(us): total=0 max=0 min=0 avg=0

● 改进 3，分块传输，没有明显性能提升，为异步做准备

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <openacc.h>
 4 
 5 #define N   (1024 * 8)
 6 
 7 #pragma acc routine seq
 8 int julia(const float cre, const float cim, float zre, float zim, const int maxIter)// Escape-iteration count of the start point z = zre + i*zim under z -> z*z + c, with c = cre + i*cim
 9 {
10     float zre2 = 0.0f, zim2 = 0.0f;                 // holds the result of the first step of each pair
11     for (int iter = 1; iter < maxIter; iter += 2)   // two z -> z*z + c steps per loop pass
12     {
13         zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;         // first step: (zre2, zim2) = z*z + c
14         if (zre2 * zre2 + zim2 * zim2 > 4.0f)                                   // |z|^2 > 4: the point has escaped
15             return iter;
16 
17         zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;     // second step, written back into (zre, zim)
18         if (zre * zre + zim * zim > 4.0f)
19             return iter;                            // NOTE(review): same value as the first-step exit although this is step iter + 1 -- confirm intended
20     }
21     return maxIter + 1 + (maxIter % 2);             // never escaped: a value above every in-loop return (129 for maxIter = 128)
22 }
23 
24 int main()
25 {
26     const int maxIter = 128;                                // iteration cap per pixel
27     const float cre = -0.8350, cim = -0.2321, h = 4.0f / N; // Julia constant c and grid step
28     int *image = (int *)malloc(sizeof(int) * N * N);
29     if (image == NULL) return 1;                            // allocation failed; placed on the former blank line so listing numbers still match the -Minfo output
30 #pragma acc data create(image[0:N * N])                         // create: device-side allocation only; results come back through the update directive below
31     {
32         const int numBlock = 4, blockSize = N * N / numBlock;   // still computed block by block; blockSize is the element count of one block
33         for (int block = 0; block < numBlock; block++)
34         {
35             const int start = block * (N / numBlock), end = start + N / numBlock;   // first row and one-past-last row of this block
36 #pragma acc kernels loop independent
37             for (int i = start; i < end; i++)
38             {
39                 for (int j = 0; j < N; j++)
40                     image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);
41             }
42 #pragma acc update host(image[block * blockSize : blockSize])   // copy each block back to the host as soon as it has been computed
43         }
44     }
45     /*
46     FILE *pf = fopen("R:/output.txt", "w");
47     for (int i = 0; i < N; i++)
48     {
49         for (int j = 0; j < N; j++)
50             fprintf(pf, "%d ", image[i * N + j]);
51         fprintf(pf, "\n");
52     }
53     fclose(pf);
54     */
55     free(image);
56     //getchar();
57     return 0;
58 }

● 输出结果

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe
julia:
      9, Generating acc routine seq
         Generating Tesla code
     11, FMA (fused multiply-add) instruction(s) generated
     17, FMA (fused multiply-add) instruction(s) generated
main:
     30, Generating create(image[:67108864])
     37, Loop is parallelizable
         FMA (fused multiply-add) instruction(s) generated
     39, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         37, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         39, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     39, FMA (fused multiply-add) instruction(s) generated
     43, Generating update self(image[block*blockSize:blockSize])

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=39 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=39 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=39 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=39 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
PGI: "acc_shutdown" not detected, performance results might be incomplete.
 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=0
    time(us): 20,474
    30: data region reached 2 times
    36: compute region reached 4 times
        39: kernel launched 4 times
            grid: [256x128]  block: [32x4]
            elapsed time(us): total=16,000 max=16,000 min=0 avg=4,000
    43: update directive reached 4 times
        43: data copyout transfers: 20
             device time(us): total=20,474 max=1,287 min=5 avg=1,023

● 改进 4，异步计算 - 双向传输

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <openacc.h>
 4 
 5 #define N   (1024 * 8)
 6 
 7 #pragma acc routine seq
 8 int julia(const float cre, const float cim, float zre, float zim, const int maxIter)// Escape-iteration count of the start point z = zre + i*zim under z -> z*z + c, with c = cre + i*cim
 9 {
10     float zre2 = 0.0f, zim2 = 0.0f;                 // holds the result of the first step of each pair
11     for (int iter = 1; iter < maxIter; iter += 2)   // two z -> z*z + c steps per loop pass
12     {
13         zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;         // first step: (zre2, zim2) = z*z + c
14         if (zre2 * zre2 + zim2 * zim2 > 4.0f)                                   // |z|^2 > 4: the point has escaped
15             return iter;
16 
17         zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;     // second step, written back into (zre, zim)
18         if (zre * zre + zim * zim > 4.0f)
19             return iter;                            // NOTE(review): same value as the first-step exit although this is step iter + 1 -- confirm intended
20     }
21     return maxIter + 1 + (maxIter % 2);             // never escaped: a value above every in-loop return (129 for maxIter = 128)
22 }
23 
24 int main()
25 {
26     const int maxIter = 128;                                // iteration cap per pixel
27     const float cre = -0.8350, cim = -0.2321, h = 4.0f / N; // Julia constant c and grid step
28     int *image = (int *)malloc(sizeof(int) * N * N);
29     if (image == NULL) return 1;                            // allocation failed; placed on the former blank line so listing numbers still match the -Minfo output
30 #pragma acc data create(image[0:N * N])                     // device-side allocation only; results come back through the update directive below
31     {
32         const int numBlock = 4, blockSize = N / numBlock * N;   // blockSize is the element count of one block of rows
33         for (int block = 0; block < numBlock; block++)
34         {
35             const int start = block * (N / numBlock), end = start + N / numBlock;   // first row and one-past-last row of this block
36 #pragma acc kernels loop independent async(block + 1)                           // asynchronous launch; the block number selects the async queue
37             for (int i = start; i < end; i++)
38             {
39                 for (int j = 0; j < N; j++)
40                     image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);
41             }
42 #pragma acc update host(image[block * blockSize : blockSize]) async(block + 1)  // same queue as the kernel, so the copy-back is ordered after this block's computation
43         }
44 #pragma acc wait                                            // wait for all queues before leaving the data region
45     }
46     /*
47     FILE *pf = fopen("R:/output.txt", "w");
48     for (int i = 0; i < N; i++)
49     {
50         for (int j = 0; j < N; j++)
51             fprintf(pf, "%d ", image[i * N + j]);
52         fprintf(pf, "\n");
53     }
54     fclose(pf);
55     */
56     free(image);
57     //getchar();
58     return 0;
59 }

● 输出结果

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe
julia:
      9, Generating acc routine seq
         Generating Tesla code
     11, FMA (fused multiply-add) instruction(s) generated
     17, FMA (fused multiply-add) instruction(s) generated
main:
     30, Generating create(image[:67108864])
     37, Loop is parallelizable
         FMA (fused multiply-add) instruction(s) generated
     39, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         37, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         39, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     39, FMA (fused multiply-add) instruction(s) generated
     43, Generating update self(image[block*blockSize:blockSize])

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=39 device=0 threadid=1 queue=1 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=39 device=0 threadid=1 queue=2 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=39 device=0 threadid=1 queue=3 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=39 device=0 threadid=1 queue=4 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
PGI: "acc_shutdown" not detected, performance results might be incomplete.
 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
    Timing may be affected by asynchronous behavior
    set PGI_ACC_SYNCHRONOUS to 1 to disable async() clauses
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=0
    time(us): 20,503
    30: data region reached 2 times
    36: compute region reached 4 times
        39: kernel launched 4 times
            grid: [256x128]  block: [32x4]
             device time(us): total=0 max=0 min=0 avg=0
    43: update directive reached 4 times
        43: data copyout transfers: 20
             device time(us): total=20,503 max=1,297 min=5 avg=1,025

● 使用统一内存访址（Ubuntu，win64 不支持）

● 改进 4，多设备版本 1，使用 OpenMP

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <omp.h>
 4 #include <openacc.h>
 5 
 6 #define N   (1024 * 8)
 7 
 8 #pragma acc routine seq
 9 int julia(const float cre, const float cim, float zre, float zim, const int maxIter)// Escape-iteration count of the start point z = zre + i*zim under z -> z*z + c, with c = cre + i*cim
10 {
11     float zre2 = 0.0f, zim2 = 0.0f;                 // holds the result of the first step of each pair
12     for (int iter = 1; iter < maxIter; iter += 2)   // two z -> z*z + c steps per loop pass
13     {
14         zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;         // first step: (zre2, zim2) = z*z + c
15         if (zre2 * zre2 + zim2 * zim2 > 4.0f)                                   // |z|^2 > 4: the point has escaped
16             return iter;
17 
18         zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;     // second step, written back into (zre, zim)
19         if (zre * zre + zim * zim > 4.0f)
20             return iter;                            // NOTE(review): same value as the first-step exit although this is step iter + 1 -- confirm intended
21     }
22     return maxIter + 1 + (maxIter % 2);             // never escaped: a value above every in-loop return (129 for maxIter = 128)
23 }
24 
25 int main()
26 {
27     const int maxIter = 128;                                // iteration cap per pixel
28     const int numDev = acc_get_num_devices(acc_device_nvidia), numBlock = numDev > 0 ? numDev : 1, blockSize = N / numBlock * N;  // device count from the OpenACC runtime (not OpenMP) is the block count; clamped so N / numBlock cannot divide by zero
29     const float cre = -0.8350, cim = -0.2321, h = 4.0f / N; // Julia constant c and grid step
30     if (numDev < 1) { fprintf(stderr, "no NVIDIA device available\n"); return 1; }  // kept on one former blank line so listing numbers still match the -Minfo output
31     int *image = (int *)malloc(sizeof(int) * N * N);
32     acc_init(acc_device_nvidia);                                    // initialise the OpenACC runtime for NVIDIA devices (original note: all target devices at once)
33     if (image == NULL) return 1;                                    // allocation failed
34 #pragma omp parallel num_threads(numBlock)                          // one host thread per device, each submitting its own work
35     {
36         acc_set_device_num(omp_get_thread_num(), acc_device_nvidia);// bind this thread to the device with the same index
37 #pragma omp for
38         for (int block = 0; block < numBlock; block++)
39         {
40             const int start = block * (N / numBlock), end = start + N / numBlock;   // NOTE(review): rows beyond numBlock * (N / numBlock) are never computed when N is not a multiple of numBlock
41 #pragma acc data copyout(image[block * blockSize : blockSize])
42             {
43 #pragma acc kernels loop independent
44                 for (int i = start; i < end; i++)
45                 {
46                     for (int j = 0; j < N; j++)
47                         image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);
48                 }
49             }
50         }
51     }
52     /*
53     FILE *pf = fopen("R:/output.txt", "w");
54     for (int i = 0; i < N; i++)
55     {
56         for (int j = 0; j < N; j++)
57             fprintf(pf, "%d ", image[i * N + j]);
58         fprintf(pf, "\n");
59     }
60     fclose(pf);
61     */
62     free(image);
63     //getchar();
64     return 0;
65 }

● 输出结果

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -mp -Minfo main.c -o main_acc.exe
julia:
     10, Generating acc routine seq
         Generating Tesla code
     12, FMA (fused multiply-add) instruction(s) generated
     18, FMA (fused multiply-add) instruction(s) generated
main:
     35, Parallel region activated
     38, Parallel loop activated with static block schedule
     41, Generating copyout(image[block*blockSize:blockSize])
     44, Loop is parallelizable
         FMA (fused multiply-add) instruction(s) generated
     46, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         44, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         46, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     46, FMA (fused multiply-add) instruction(s) generated
     51, Barrier
     62, Parallel region terminated

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=46 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
PGI: "acc_shutdown" not detected, performance results might be incomplete.
 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=0
    time(us): 20,462
    41: data region reached 2 times
        50: data copyout transfers: 17
             device time(us): total=20,462 max=1,297 min=5 avg=1,203
    43: compute region reached 1 time
        46: kernel launched 1 time
            grid: [256x128]  block: [32x4]
             device time(us): total=0 max=0 min=0 avg=0

● 改进 5，多设备版本 2，调整 OpenMP

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <omp.h>
 4 #include <openacc.h>
 5 
 6 #define N   (1024 * 8)
 7 
 8 #pragma acc routine seq
 9 int julia(const float cre, const float cim, float zre, float zim, const int maxIter)// Escape-iteration count of the start point z = zre + i*zim under z -> z*z + c, with c = cre + i*cim
10 {
11     float zre2 = 0.0f, zim2 = 0.0f;                 // holds the result of the first step of each pair
12     for (int iter = 1; iter < maxIter; iter += 2)   // two z -> z*z + c steps per loop pass
13     {
14         zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;         // first step: (zre2, zim2) = z*z + c
15         if (zre2 * zre2 + zim2 * zim2 > 4.0f)                                   // |z|^2 > 4: the point has escaped
16             return iter;
17 
18         zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;     // second step, written back into (zre, zim)
19         if (zre * zre + zim * zim > 4.0f)
20             return iter;                            // NOTE(review): same value as the first-step exit although this is step iter + 1 -- confirm intended
21     }
22     return maxIter + 1 + (maxIter % 2);             // never escaped: a value above every in-loop return (129 for maxIter = 128)
23 }
24 
25 int main()
26 {
27     const int maxIter = 128;                                // iteration cap per pixel
28     const int numDev = acc_get_num_devices(acc_device_nvidia), numBlock = numDev > 0 ? numDev : 1, blockSize = N / numBlock * N;  // device count from the OpenACC runtime is the block count; clamped so N / numBlock cannot divide by zero
29     const float cre = -0.8350, cim = -0.2321, h = 4.0f / N; // Julia constant c and grid step
30     if (numDev < 1) { fprintf(stderr, "no NVIDIA device available\n"); return 1; }  // kept on one former blank line so listing numbers still match the -Minfo output
31     int *image = (int *)malloc(sizeof(int) * N * N);
32     acc_init(acc_device_nvidia);
33     if (image == NULL) return 1;                            // allocation failed
34 #pragma omp parallel for num_threads(numBlock)          // device selection moved into its own parallel loop
35     for(int block = 0;block<numBlock;block++)
36         acc_set_device_num(block, acc_device_nvidia);   // NOTE(review): assumes the next parallel region runs block b on the thread that selected device b here -- confirm
37 
38 #pragma omp parallel for num_threads(numBlock)          // was "omp for": num_threads is only valid on parallel, and an orphaned "for" runs on a single thread
39     for (int block = 0; block < numBlock; block++)
40     {
41         const int start = block * (N / numBlock), end = start + N / numBlock;   // NOTE(review): rows beyond numBlock * (N / numBlock) are never computed when N is not a multiple of numBlock
42 #pragma acc data copyout(image[block * blockSize : blockSize])
43         {
44 #pragma acc kernels loop independent
45             for (int i = start; i < end; i++)
46             {
47                 for (int j = 0; j < N; j++)
48                     image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);
49             }
50         }
51     }
52     /*
53     FILE *pf = fopen("R:/output.txt", "w");
54     for (int i = 0; i < N; i++)
55     {
56         for (int j = 0; j < N; j++)
57             fprintf(pf, "%d ", image[i * N + j]);
58         fprintf(pf, "\n");
59     }
60     fclose(pf);
61     */
62     free(image);
63     //getchar();
64     return 0;
65 }

● 输出结果

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc main.c -acc -Minfo -o main_acc.exe
julia:
     10, Generating acc routine seq
         Generating Tesla code
     12, FMA (fused multiply-add) instruction(s) generated
     18, FMA (fused multiply-add) instruction(s) generated
main:
     42, Generating copyout(image[block*blockSize:blockSize])
     45, Loop is parallelizable
         FMA (fused multiply-add) instruction(s) generated
     47, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         45, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         47, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     47, FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=47 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=256x128 block=32x4
PGI: "acc_shutdown" not detected, performance results might be incomplete.
 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=0
    time(us): 20,571
    42: data region reached 2 times
        51: data copyout transfers: 17
             device time(us): total=20,571 max=1,336 min=11 avg=1,210
    44: compute region reached 1 time
        47: kernel launched 1 time
            grid: [256x128]  block: [32x4]
            elapsed time(us): total=3,000 max=3,000 min=3,000 avg=3,000

相关阅读:
在Oracle中计算两个日期间隔的天数、月数和年数
 洛谷P1182 数列分段 Section II（二分+贪心）
BZOJ1734 [Usaco2005 feb]Aggressive cows 愤怒的牛（二分答案+贪心）
分治算法
 洛谷P1031 [NOIP2002]均分纸牌
 洛谷P1803 凌乱的yyy / 线段覆盖
 洛谷P1094 [NOIP2007]纪念品分组
 洛谷P1223 排队接水
 洛谷P1208 [USACO1.3]混合牛奶 Mixing Milk
洛谷P1181 数列分段Section1
原文地址：https://www.cnblogs.com/cuancuancuanhao/p/9438406.html