• OpenACC data management directives


    ▶ Code and notes for the data management section of Chapter 4 of the book

    ● Code covering copy, copyin, copyout, and create

     1 #include <stdio.h>
     2 #include <openacc.h>
     3 
     4 int main()
     5 {
     6     const int length = 1024;
     7     int a[length], b[length], c[length], d[length];
     8 
     9     for (int i = 0; i < length; i++) a[i] = b[i] = c[i] = 1;
    10     {
    11 #pragma acc kernels create(d)
    12         for (int i = 0; i < length; i++)
    13         {
    14             a[i] ++;
    15             c[i] = a[i] + b[i];
    16             d[i] = 0;
    17         }
    18     }
    19     for (int i = 0; i < 10; i++)
    20         printf("a[%d] = %d, c[%d] = %d\n", i, a[i], i, c[i]);
    21     getchar();
    22     return 0;
    23 }

    ● Compiler output: the intermediate array d is created explicitly, while a, b, and c are created implicitly, each with a different copy attribute

    D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe
    main:
         11, Generating create(d[:])
             Generating implicit copyout(c[:1024])
             Generating implicit copyin(b[:1024])
             Generating implicit copy(a[:1024])
         12, Loop is parallelizable
             Accelerator kernel generated
             Generating Tesla code
             12, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
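
    ● For reference, running main_acc.exe should print a[i] = 2 and c[i] = 3 for i = 0..9: a, b and c start at 1, a is incremented once on the device, c = a + b comes back via the implicit copyout, and d is create-only so it never returns to the host.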

    ● Using a standalone copyout directive inside a kernels region triggers a warning: PGC-W-0996-The directive #pragma acc copyout is deprecated; use #pragma acc declare copyout instead (main.c: XX)
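
    ● A sketch (not from the book) of the same region with the data clauses spelled out on the kernels directive itself instead of relying on the implicit ones: a is read and written (copy), b is only read (copyin), c is only written (copyout), and d is device-only scratch (create). Line 11 of the listing above would become:

     #pragma acc kernels copy(a) copyin(b) copyout(c) create(d)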

    ● enter data and exit data, used from C++.

    ■ First of all, PGI on Windows does not support C++ compilation: there is only pgcc.exe and no pgc++*.exe, so these examples have to be written on Linux.

    ■ The code in the book has a problem. In short, OpenACC's copy is a shallow copy: for data structures that contain pointers (such as vector or class objects), the objects the pointers refer to are not copied along with them. There are two workarounds: one is to de-structure the class, gathering its data into plain arrays that can be copied directly; the other is to use managed memory, which removes the need for explicit copies altogether (a minimal sketch follows after these notes). [https://stackoverflow.com/questions/53860467/how-to-copy-on-gpu-a-vector-of-vector-pointer-memory-allocated-in-openacc]

    ■ The book's code uses neither workaround and fails at runtime with "call to cuStreamSynchronize returned error 700: Illegal address during kernel execution"; this error is fairly common [https://stackoverflow.com/search?q=call+to+returned+error+700%3A+Illegal+address+during+kernel+execution]
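
    ■ A minimal sketch of what the shallow copy means and of the manual deep-copy pattern used below. The Vec struct is hypothetical (not the book's code), and the sketch assumes PGI's support for member array sections in data clauses, the same feature the ivector example below relies on:

     #include <cstdio>

     struct Vec { int n; double *data; };             // a pointer-holding type, like a stripped-down vector

     int main()
     {
         Vec v;
         v.n    = 1024;
         v.data = new double[v.n];
         for (int i = 0; i < v.n; i++) v.data[i] = 1.0;
         int n = v.n;
     #pragma acc enter data copyin(v)                 // shallow: copies n and the *value* of the data pointer only
     #pragma acc enter data copyin(v.data[0:n])       // manual deep copy: copies the buffer and attaches it to the device copy of v
     #pragma acc parallel loop present(v.data[0:n])
         for (int i = 0; i < n; i++)
             v.data[i] += i;
     #pragma acc update self(v.data[0:n])             // bring the results back to the host
         printf("v.data[5] = %f\n", v.data[5]);       // expect 6.000000
     #pragma acc exit data delete(v.data[0:n])
     #pragma acc exit data delete(v)
         delete[] v.data;
         return 0;
     }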

    ● De-structuring the data into plain arrays

     1 #include <iostream>
     2 #include <vector>
     3 #include <cstdint>
     4 
     5 using namespace std;
     6 
     7 int main()
     8 {
     9     const int vectorCount = 1024, vectorLength = 20;
    10     long sum = 0;
    11 
    12     vector<int32_t> *vectorTable = new vector<int32_t>[vectorCount]; // 1024 vectors, each filled with 20 elements
    13     for (int i = 0; i < vectorCount; i++)
    14     {
    15         for (int j = 0; j < vectorLength; j++)
    16             vectorTable[i].push_back(i);
    17     }
    18     int32_t **arrayTable = new int32_t *[vectorCount]; // pointer array holding each vector's raw data pointer, parallel to vectorTable
    19     int *vectorSize = new int[vectorCount];            // size of each vector
    20 
    21 #pragma acc enter data create(arrayTable [0:vectorCount] [0:0]) // create arrayTable on the device; [0:0] allocates only the pointer array, each row is copied and attached below
    22     for (int i = 0; i < vectorCount; i++)
    23     {
    24         int sze = vectorTable[i].size();
    25         vectorSize[i] = sze;
    26         arrayTable[i] = vectorTable[i].data();        // hand each vector's data pointer to arrayTable
    27 #pragma acc enter data copyin(arrayTable [i:1][:sze]) // copy each vector's data to the device and attach it
    28     }
    29 #pragma acc enter data copyin(vectorSize[:vectorCount]) // put the vector sizes on the device as well
    30 
    31 #pragma acc parallel loop gang vector reduction(+: sum) present(arrayTable, vectorSize) // reduction over all elements on the device
    32     for (int i = 0; i < vectorCount; i++)
    33     {
    34         for (int j = 0; j < vectorSize[i]; ++j)
    35             sum += arrayTable[i][j];
    36     }
    37     cout << "Sum: " << sum << endl;
    38 
    39 #pragma acc exit data delete (vectorSize)
    40 #pragma acc exit data delete (arrayTable)
    41     delete[] vectorSize;
    42     delete[] vectorTable;
    43     return 0;
    44 }

    ● Output

    cuan@CUAN:~$ pgc++ main.cpp -o main.exe --c++11 -ta=tesla -Minfo -acc
    main:
         95, Generating enter data create(arrayTable[:1024][:0])
         32, Generating enter data copyin(arrayTable[i][:sze+1],vectorSize[:1024])
             Generating implicit copy(sum)
             Generating present(vectorSize[:])
             Generating Tesla code
             32, Generating reduction(+:sum)
             32, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             34, #pragma acc loop seq
         32, Generating present(arrayTable[:][:])
         34, Loop is parallelizable
         41, Generating exit data delete(vectorSize[:1],arrayTable[:1][:1])
    cuan@CUAN:~$ ./main.exe
    Sum: 10475520
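
    ■ Sanity check: vector i holds 20 copies of i, so the expected result is 20 × (0 + 1 + … + 1023) = 20 × 523776 = 10475520, which matches the printed Sum.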

    ● Using managed memory

     1 #include <iostream>
     2 
     3 using namespace std;
     4 
     5 class ivector
     6 {
     7 public:
     8     int len;
     9     int *arr;
    10     ivector(int length)
    11     {
    12         len = length;
    13         arr = new int[len];
    14 #pragma acc enter data copyin(this)
    15 #pragma acc enter data create(arr [0:len])
    16 #pragma acc parallel loop present(arr [0:len])
    17         for (int iend = len, i = 0; i < iend; i++)              // use a temporary iend so the compiler does not assume len may change inside the loop and refuse to parallelize
    18             arr[i] = i;
    19     }
    20 
    21     ivector(const ivector &s)
    22     {
    23         len = s.len;
    24         arr = new int[len];
    25 #pragma acc enter data copyin(this)
    26 #pragma acc enter data create(arr [0:len])
    27 #pragma acc parallel loop present(arr [0:len], s.arr [0:len])   // s is already present on the device as well
    28         for (int iend = len, i = 0; i < iend; i++)
    29             arr[i] = s.arr[i];
    30     }
    31 
    32     ~ivector()
    33     {
    34 #pragma acc exit data delete (arr)                              // on destruction, remove arr and then this from the device
    35 #pragma acc exit data delete (this)
    36         cout << "deconstruction!" << endl;
    37         delete[] arr;
    38         len = 0;
    39     }
    40 
    41     int &operator[](int i)
    42     {
    43         if (i < 0 || i >= this->len)
    44             return arr[0];
    45         return arr[i];
    46     }
    47 
    48     void add(int c)
    49     {
    50 #pragma acc kernels loop present(arr [0:len])                   // every operation that touches arr must state that it is present
    51         for (int iend = len, i = 0; i < iend; i++)
    52             arr[i] += c;
    53     }
    54 
    55     void updateHost()                                           // manually update the host-side data
    56     {
    57 #pragma acc update host(arr [0:len])
    58     }
    59 };
    60 
    61 int main()
    62 {
    63     ivector s1(20);
    64     s1.add(10);
    65     s1.updateHost();
    66     cout << "s1[1] = " << s1[1] << endl;
    67 
    68     ivector s2(s1);
    69     s2.updateHost();
    70     cout << "s2[1] = " << s2[1] << endl;
    71 
    72     return 0;
    73 }

    ● Output. Without -ta=tesla:managed the program fails at runtime [to be filled in]

    cuan@CUAN:~$ pgc++ main.cpp -o main.exe --c++11 -ta=tesla:managed -Minfo -acc
    ivector::ivector(int):
         13, Generating enter data copyin(this[:1])
             Generating enter data create(arr[:len])
             Generating Tesla code
             17, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
         13, Generating implicit copy(this[:])
             Generating present(arr[:len])
    ivector::ivector(const ivector&):
         24, Generating enter data create(arr[:len])
             Generating enter data copyin(this[:1])
             Generating present(arr[:len])
             Generating Tesla code
             28, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
         24, Generating implicit copyin(s[:])
             Generating implicit copy(this[:])
             Generating present(s->arr[:len])
    ivector::~ivector():
         36, Generating exit data delete(this[:1],arr[:1])
    ivector::add(int):
          0, Generating Tesla code
         49, Accelerator serial kernel generated
             Generating implicit copy(this[:])
             Generating present(arr[:len])
         51, Loop is parallelizable
             Generating Tesla code
             51, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    ivector::updateHost():
         58, Generating update self(arr[:len])
    cuan@CUAN:~$ ./main.exe
    launch CUDA kernel  file=/home/cuan/main.cpp function=_ZN7ivectorC1Ei line=13 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128
    launch CUDA kernel  file=/home/cuan/main.cpp function=_ZN7ivector3addEi line=49 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=1 grid=1 block=1
    launch CUDA kernel  file=/home/cuan/main.cpp function=_ZN7ivector3addEi line=51 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128
    s1[1] = 11
    launch CUDA kernel  file=/home/cuan/main.cpp function=_ZN7ivectorC1ERKS_ line=24 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128
    s2[1] = 11
    deconstruction!
    deconstruction!
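
    ■ What -ta=tesla:managed buys here: dynamic allocations (new/malloc) are placed in CUDA unified memory, so heap data can be touched from device code without explicit data clauses, which is what the second workaround above refers to. A minimal sketch of that idea (mine, not the book's; assumes pgc++ -acc -ta=tesla:managed):

     #include <iostream>
     #include <vector>

     int main()
     {
         std::vector<int> v(1024, 1);
         int *p = v.data();                  // raw pointer into the (managed) heap buffer
     #pragma acc parallel loop               // no copy/copyin/present clauses at all
         for (int i = 0; i < 1024; ++i)
             p[i] += i;                      // unified-memory pages migrate on demand
         std::cout << "v[5] = " << v[5] << std::endl;   // expect 6
         return 0;
     }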

    ● This book has an approach for using OpenACC from C++ [https://www.elsevier.com/books/parallel-programming-with-openacc/farber/978-0-12-410397-9]; the code is accList.double.cpp under [https://github.com/rmfarber/ParallelProgrammingWithOpenACC/tree/master/Chapter05]

      1 // accList.h
      2 #ifndef ACC_LIST_H
      3 #define ACC_LIST_H
      4 
      5 #include <cstdlib>
      6 #include <cassert>
      7 #ifdef _OPENACC
      8 #include <openacc.h>
      9 #endif
     10 
     11 template<typename T>
     12 class accList
     13 {
     14 public:
     15     explicit accList() {}
     16     explicit accList(size_t size)           // the constructor copies the this pointer to the device, then allocates memory
     17     {
     18 #pragma acc enter data copyin(this)         
     19         allocate(size);
     20     }
     21 
     22     ~accList()                              // the destructor releases the memory, then removes this from the device
     23     {
     24         release();
     25 #pragma acc exit data delete(this)          
     26     }
     27 
     28 #pragma acc routine seq
     29     T& operator[](size_t idx)
     30     {
     31         return _A[idx];
     32     }
     33 
     34 #pragma acc routine seq
     35     const T& operator[](size_t idx) const
     36     {
     37         return _A[idx];
     38     }
     39 
     40     size_t size() const
     41     {
     42         return _size;
     43     }
     44 
     45     accList& operator=(const accList& B)
     46     {
     47         allocate(B.size());
     48         for (size_t j = 0; j < _size; ++j)
     49         {
     50             _A[j] = B[j];
     51         }
     52         accUpdateDevice();
     53         return *this;
     54     }
     55 
     56     void insert(size_t idx, const T& val)
     57     {
     58         _A[idx] = val;
     59     }
     60     void insert(size_t idx, const T* val)
     61     {
     62         _A[idx] = *val;
     63     }
     64 
     65     void accUpdateSelf()
     66     {
     67         accUpdateSelfT(_A, 0);
     68     }
     69     void accUpdateDevice()
     70     {
     71         accUpdateDeviceT(_A, 0);
     72     }
     73 
     74 private:
     75     T * _A{ nullptr };                      // the only data members are a pointer and a length
     76     size_t _size{ 0 };
     77 
     78     void release()
     79     {
     80         if (_size > 0)
     81         {
     82 #pragma acc exit data delete(_A[0:_size])   // remove the device copy when releasing the memory
     83             delete[] _A;
     84             _A = nullptr;
     85             _size = 0;
     86         }
     87     }
     88 
     89     void allocate(size_t size)
     90     {
     91         if (_size != size)                  // reallocate when the requested size differs from the current one
     92         {
     93             release();
     94             _size = size;
     95 #pragma acc update device(_size)
     96             if (_size > 0)
     97             {
     98                 _A = new T[_size];
     99 #ifdef _OPENACC                             // with OpenACC, assert that _A is not already present on the device
    100                 assert(!acc_is_present(&_A[0], sizeof(T)));
    101 #endif
    102 #pragma acc enter data create(_A[0:_size])  // allocate a new block on the device
    103             }
    104         }
    105     }
    106 
    107     template<typename U>
    108     void accUpdateSelfT(U *p, long)
    109     {
    110 #pragma acc update self(p[0:_size])
    111     }
    112 
    113     template<typename U>
    114     auto accUpdateSelfT(U *p, int) -> decltype(p->accUpdateSelf())
    115     {
    116         for (size_t j = 0; j < _size; ++j)
    117         {
    118             p[j].accUpdateSelf();
    119         }
    120     }
    121 
    122     template<typename U>
    123     void accUpdateDeviceT(U *p, long)
    124     {
    125 #pragma acc update device(p[0:_size])
    126     }
    127 
    128     template<typename U>
    129     auto accUpdateDeviceT(U *p, int) -> decltype(p->accUpdateDevice())
    130     {
    131         for (size_t j = 0; j < _size; ++j)
    132         {
    133             p[j].accUpdateDevice();
    134         }
    135     }
    136 };
    137 #endif
    138 
    139 // main.cpp
    140 #include <iostream>
    141 #include <cstdlib>
    142 #include <cstdint>
    143 #include "accList.h" 
    144 using namespace std;
    145 #ifndef N
    146 #define N 1024
    147 #endif
    148 
    149 int main()
    150 {
    151     accList<double> A(N), B(N);
    152     for (int i = 0; i < B.size(); ++i)
    153         B[i] = 2.5;
    154     B.accUpdateDevice();                                // manually update the device copy
    155 #pragma acc parallel loop gang vector present(A,B)
    156     for (int i = 0; i < A.size(); ++i)
    157         A[i] = B[i] + i;
    158     A.accUpdateSelf();                                  // manually update the host copy
    159     for (int i = 0; i<10; ++i)
    160         cout << "A[" << i << "]: " << A[i] << endl;
    161     cout << "......" << endl;
    162     for (int i = N - 10; i<N; ++i)
    163         cout << "A[" << i << "]: " << A[i] << endl;
    164     return 0;
    165 }

    ● Run output

    cuan@CUAN:~/acc$ pgc++ main.cpp -o main.exe -Minfo -acc
    main:
         16, Generating present(B,A)
             Generating Tesla code
             18, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    accList<double>::accList(unsigned long):
          4, include "accList.h"
              18, Generating enter data copyin(this[:1])
    accList<double>::~accList():
          4, include "accList.h"
              25, Generating exit data delete(this[:1])
    accList<double>::operator [](unsigned long):
          4, include "accList.h"
              29, Generating acc routine seq
                  Generating Tesla code
    accList<double>::size() const:
          4, include "accList.h"
              40, Generating implicit acc routine seq
                  Generating acc routine seq
                  Generating Tesla code
    accList<double>::release():
          4, include "accList.h"
              82, Generating exit data delete(_A[:_size])
    accList<double>::allocate(unsigned long):
          4, include "accList.h"
              95, Generating update device(_size)
             102, Generating enter data create(_A[:_size])
    void accList<double>::accUpdateSelfT<double>(T1 *, long):
          4, include "accList.h"
             110, Generating update self(p[:_size])
    void accList<double>::accUpdateDeviceT<double>(T1 *, long):
          4, include "accList.h"
             125, Generating update device(p[:_size])
    cuan@CUAN:~/acc$ ./main.exe 
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=16 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128
    A[0]: 2.5
    A[1]: 3.5
    A[2]: 4.5
    A[3]: 5.5
    A[4]: 6.5
    A[5]: 7.5
    A[6]: 8.5
    A[7]: 9.5
    A[8]: 10.5
    A[9]: 11.5
    ......
    A[1014]: 1016.5
    A[1015]: 1017.5
    A[1016]: 1018.5
    A[1017]: 1019.5
    A[1018]: 1020.5
    A[1019]: 1021.5
    A[1020]: 1022.5
    A[1021]: 1023.5
    A[1022]: 1024.5
    A[1023]: 1025.5
    
    Accelerator Kernel Timing data
    /home/cuan/acc/main.cpp
      main  NVIDIA  devicenum=0
        time(us): 13
        16: compute region reached 1 time
            16: kernel launched 1 time
                grid: [1024]  block: [128]
                 device time(us): total=13 max=13 min=13 avg=13
                elapsed time(us): total=329 max=329 min=329 avg=329
        16: data region reached 2 times
    /home/cuan/acc/main.cpp
      _ZN7accListIdEC1Em  NVIDIA  devicenum=0
        time(us): 12
        18: data region reached 2 times
            18: data copyin transfers: 2
                 device time(us): total=12 max=9 min=3 avg=6
    /home/cuan/acc/main.cpp
      _ZN7accListIdED1Ev  NVIDIA  devicenum=0
        time(us): 0
        25: data region reached 2 times
    /home/cuan/acc/main.cpp
      _ZN7accListIdE7releaseEv  NVIDIA  devicenum=0
        time(us): 7
        82: data region reached 2 times
            82: data copyin transfers: 2
                 device time(us): total=7 max=4 min=3 avg=3
    /home/cuan/acc/main.cpp
      _ZN7accListIdE8allocateEm  NVIDIA  devicenum=0
        time(us): 11
        95: update directive reached 2 times
            95: data copyin transfers: 2
                 device time(us): total=5 max=3 min=2 avg=2
        102: data region reached 2 times
            102: data copyin transfers: 2
                 device time(us): total=6 max=3 min=3 avg=3
    /home/cuan/acc/main.cpp
      _ZN7accListIdE14accUpdateSelfTIdEEvPT_l  NVIDIA  devicenum=0
        time(us): 10
        110: update directive reached 1 time
            110: data copyout transfers: 1
                 device time(us): total=10 max=10 min=10 avg=10
    /home/cuan/acc/main.cpp
      _ZN7accListIdE16accUpdateDeviceTIdEEvPT_l  NVIDIA  devicenum=0
        time(us): 3
        125: update directive reached 1 time
            125: data copyin transfers: 1
                 device time(us): total=3 max=3 min=3 avg=3
  • Original article: https://www.cnblogs.com/cuancuancuanhao/p/9416788.html