• OpenACC 书上的范例代码(Jacobi 迭代),part 4


    ▶ 使用Jacobi 迭代求泊松方程的数值解

    ● 使用 routine 导语封装模平方函数,把 u1 放进设备端再次减少拷贝开销,数组初始化也在设备中完成

     1 #include <stdio.h>
     2 #include <stdlib.h>
     3 #include <math.h>
     4 #include <time.h>
     5 #include <openacc.h>
     6 
     /*
      * Portable wall-clock timing shim.
      *
      *   timestruct   - opaque timestamp type
      *   gettime(a)   - store the current time into *a
      *   usec(t1, t2) - elapsed time from t1 to t2
      *
      * On Windows the timestamp is a clock() tick count, so usec() yields ticks
      * (the caller prints them as "ms"); elsewhere it is a struct timeval and
      * usec() yields microseconds.
      */
     #if defined(_WIN32) || defined(_WIN64)
     /*
      * NOTE(review): the path separators of this include were lost in the
      * published listing; this is the reconstructed PGI 19.4 location. Forward
      * slashes are used because a backslash inside <...> is undefined behavior
      * in standard C, while Windows compilers accept '/'.
      */
     #include <C:/Program Files/PGI/win64/19.4/include/wrap/sys/timeb.h>
     #define timestruct clock_t
     #define gettime(a) (*(a) = clock())
     /* Arguments are parenthesized so expressions can be passed safely. */
     #define usec(t1, t2) ((t2) - (t1))
     #else
     #include <sys/time.h>
     #define gettime(a)  gettimeofday(a, NULL)
     #define usec(t1, t2) (((t2).tv_sec - (t1).tv_sec) * 1000000 + (t2).tv_usec - (t1).tv_usec)
     typedef struct timeval timestruct;
     #endif
    18 
    /*
     * Boundary value of the solution at the point (x, y): x^2 + y^2.
     *
     * The "routine seq" directive asks the OpenACC compiler to also build a
     * sequential device version so the function can be called from inside
     * accelerator loops.
     *
     * Declared "static inline" rather than plain "inline": in C99/C11 a plain
     * inline definition does not provide an external definition, so any call
     * the compiler chooses not to inline would fail at link time.
     */
    #pragma acc routine seq
    static inline float uval(float x, float y)
    {
        return x * x + y * y;
    }
    24 
    25 int main()
    26 {
    27     const int row = 8191, col = 1023;
    28     const float height = 1.0, width = 2.0;
    29     const float hx = height / row, wy = width / col;
    30     const float fij = -4.0f;
    31     const float hx2 = hx * hx, wy2 = wy * wy, c1 = hx2 * wy2, c2 = 1.0f / (2.0 * (hx2 + wy2));
    32     const int maxIter = 100;
    33     const int colPlus = col + 1;
    34 
    35     float *restrict u0 = (float *)malloc(sizeof(float)*(row + 1)*colPlus);
    36     float *restrict u1 = (float *)malloc(sizeof(float)*(row + 1)*colPlus);
    37     float *utemp = NULL;
    38 
    39     timestruct t1, t2;
    40     acc_init(acc_device_nvidia);
    41     gettime(&t1);
    42 #pragma acc data copyout(u0[0:(row + 1) * colPlus]) create(u1[0:(row + 1) * colPlus])
    43     {
    44 #pragma acc kernels present(u0[0:((row + 1) * colPlus)])
    45         {
    46 #pragma acc loop independent 
    47             for (int i = 0; i < (row + 1)*(col + 1); i++)
    48                 u0[i] = 0.0f;
    49 #pragma acc loop independent
    50             for (int ix = 0; ix <= row; ix++)
    51                 u0[ix * colPlus + 0] = uval(ix * hx, 0.0f);
    52 #pragma acc loop independent
    53             for (int ix = 0; ix <= row; ix++)
    54                 u0[ix * colPlus + col] = uval(ix * hx, col * wy);
    55 #pragma acc loop independent
    56             for (int jy = 0; jy <= col; jy++)
    57                 u0[jy] = uval(0.0f, jy * wy);
    58 #pragma acc loop independent
    59             for (int jy = 0; jy <= col; jy++)
    60                 u0[row * colPlus + jy] = u1[row * colPlus + jy] = uval(row * hx, jy * wy);
    61         }
    62         
    63         for (int iter = 0; iter < maxIter; iter++)
    64         {
    65 #pragma acc kernels present(utemp, u0[0:((row + 1) * colPlus)], u1[0:((row + 1) * colPlus)])
    66             {
    67 #pragma acc loop independent
    68                 for (int ix = 1; ix < row; ix++)
    69                 {
    70 #pragma acc loop independent
    71                     for (int jy = 1; jy < col; jy++)
    72                     {
    73                         u1[ix*colPlus + jy] = (c1*fij + wy2 * (u0[(ix - 1)*colPlus + jy] + u0[(ix + 1)*colPlus + jy]) + 
    74                             hx2 * (u0[ix*colPlus + jy - 1] + u0[ix*colPlus + jy + 1])) * c2;
    75                     }
    76                 }
    77             }
    78             utemp = u0, u0 = u1, u1 = utemp;
    79 #pragma acc wait
    80         }                
    81     }
    82     gettime(&t2);
    83 
    84     long long timeElapse = usec(t1, t2);
    85 #if defined(_WIN32) || defined(_WIN64)
    86     printf("
    Elapsed time: %13ld ms.
    ", timeElapse);
    87 #else    
    88     printf("
    Elapsed time: %13ld us.
    ", timeElapse);
    89 #endif
    90     free(u0);
    91     free(u1);
    92     acc_shutdown(acc_device_nvidia);
    93     //getchar();
    94     return 0;
    95 }

    ● 输出结果,win10 中nvvp显示总时间减少到了 247 ms,计算时间 63 ms 几乎不变

    D:\Code\OpenACC>pgcc main.c -o main.exe -c99 -Minfo -acc
    main:
         42, Generating copyout(u0[:colPlus*(row+1)])
             Generating create(u1[:colPlus*(row+1)])
         44, Generating present(u0[:colPlus*(row+1)])
         47, Loop is parallelizable
             Generating Tesla code
             47, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
         47, Memory zero idiom, loop replaced by call to __c_mzero4
         50, Loop is parallelizable
             Generating Tesla code
             50, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
         53, Loop is parallelizable
             Generating Tesla code
             53, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
         56, Loop is parallelizable
             Generating Tesla code
             56, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
         59, Loop is parallelizable
             Generating Tesla code
             59, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
         63, Generating present(utemp[:],u1[:colPlus*(row+1)],u0[:colPlus*(row+1)])
         68, Loop is parallelizable
         71, Loop is parallelizable
             Generating Tesla code
             68, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
             71, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
         71, FMA (fused multiply-add) instruction(s) generated
    uval:
         21, Generating acc routine seq
             Generating Tesla code
         22, FMA (fused multiply-add) instruction(s) generated
    
    D:\Code\OpenACC>main.exe
    
    Elapsed time:            63 ms.

    ● 输出结果,Ubuntu 中时间也减少了少许

    ...
    
    Elapsed time:         63274 us.
    
    Accelerator Kernel Timing data
    /home/cuan/my.c
      main  NVIDIA  devicenum=0
        time(us): 36,680
        42: data region reached 2 times
            82: data copyout transfers: 3
                 device time(us): total=2,587 max=1,299 min=10 avg=862
        44: compute region reached 1 time
            47: kernel launched 1 time
                grid: [65535]  block: [128]
                 device time(us): total=155 max=155 min=155 avg=155
                elapsed time(us): total=317 max=317 min=317 avg=317
            50: kernel launched 1 time
                grid: [64]  block: [128]
                 device time(us): total=5 max=5 min=5 avg=5
                elapsed time(us): total=17 max=17 min=17 avg=17
            53: kernel launched 1 time
                grid: [64]  block: [128]
                 device time(us): total=5 max=5 min=5 avg=5
                elapsed time(us): total=16 max=16 min=16 avg=16
            56: kernel launched 1 time
                grid: [8]  block: [128]
                 device time(us): total=1 max=1 min=1 avg=1
                elapsed time(us): total=12 max=12 min=12 avg=12
            59: kernel launched 1 time
                grid: [8]  block: [128]
                 device time(us): total=2 max=2 min=2 avg=2
                elapsed time(us): total=13 max=13 min=13 avg=13
        44: data region reached 2 times
        63: data region reached 200 times
        65: compute region reached 100 times
            71: kernel launched 100 times
                grid: [32x1024]  block: [32x4]
                 device time(us): total=33,925 max=358 min=336 avg=339
                elapsed time(us): total=36,792 max=872 min=347 avg=367
  • 相关阅读:
    input 放大镜
    记住密码弹出事件
    thinkphp修改及编写标签库,编辑器的使用
    thinkphp 配合mongodb
    缓存技术
    php面试题目
    pdo 整套类的封装,保存修改查询
    mongodb 的备份恢复导入与导出
    mongodb 分组查询
    smarty 模板的入门使用
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/11033195.html
Copyright © 2020-2023  润新知