• 0_Simple__UnifiedMemoryStreams


    Using both OpenMP and pthreads, the sample relies on Unified Memory (managed memory) to compute the basic matrix-vector product result = α * A * x + β * result, dispatching small tasks to the host and larger ones to the GPU through per-thread CUDA streams.

    ▶ Source code

        #include <cstdio>
        #include <cstdlib>
        #include <ctime>
        #include <vector>
        #include <algorithm>
        #include <cuda_runtime.h>
        #include "device_launch_parameters.h"
        #include <cublas_v2.h>

        //#define USE_PTHREADS // define USE_PTHREADS to build the pthreads version instead of the OpenMP one
        #ifdef USE_PTHREADS
            #include <pthread.h>
            #pragma comment(lib, "pthreadVC2.lib")
        #else
            #include <omp.h>
        #endif

        // Windows has no srand48 / drand48, so provide equivalent replacements
        #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        void srand48(long seed) { srand((unsigned int)seed); }
        double drand48() { return double(rand()) / RAND_MAX; }
        #endif

        template <typename T> struct Task // a struct can have constructors, a destructor and member functions, just like a class
        {
            unsigned int size, id;
            T *data;
            T *result;
            T *vector;

            Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL) {};
            Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL), vector(NULL)
            {
                cudaMallocManaged(&data, sizeof(T)*size*size);
                cudaMallocManaged(&result, sizeof(T)*size);
                cudaMallocManaged(&vector, sizeof(T)*size);
                cudaDeviceSynchronize();
            }

            ~Task()
            {
                cudaDeviceSynchronize();
                cudaFree(data);
                cudaFree(result);
                cudaFree(vector);
            }

            void allocate(const unsigned int s, const unsigned int unique_id) // allocate managed memory and initialize the member arrays
            {
                id = unique_id;
                size = s;
                cudaMallocManaged(&data, sizeof(T)*size*size);
                cudaMallocManaged(&result, sizeof(T)*size);
                cudaMallocManaged(&vector, sizeof(T)*size);
                cudaDeviceSynchronize();

                for (unsigned int i = 0; i < size*size; i++)
                    data[i] = drand48();
                for (unsigned int i = 0; i < size; i++)
                {
                    result[i] = 0.;
                    vector[i] = drand48();
                }
            }
        };

        #ifdef USE_PTHREADS // per-thread work description for the pthreads version
        struct threadData_t
        {
            int tid;
            Task<double> *TaskListPtr;
            cudaStream_t *streams;
            cublasHandle_t *handles;
            int taskSize;
        };

        typedef struct threadData_t threadData;
        #endif

        template <typename T> void gemv(int m, int n, T *alpha, T *A, T *x, T *beta, T *result) // result = alpha * A * x + beta * result
        {
            for (int i = 0; i < m; i++) // the original sample wrote n here and dropped the alpha factor below
            {
                result[i] *= *beta;
                for (int j = 0; j < n; j++)
                    result[i] += *alpha * A[i*n + j] * x[j];
            }
        }

        // execute a single task on either host or device depending on size
        #ifdef USE_PTHREADS
        void * execute(void* inpArgs)
        {
            threadData *dataPtr    = (threadData *) inpArgs;
            cudaStream_t *stream   = dataPtr->streams;
            cublasHandle_t *handle = dataPtr->handles;
            int tid                = dataPtr->tid;

            for (int i = 0; i < dataPtr->taskSize; i++)
            {
                Task<double>  &t = dataPtr->TaskListPtr[i];
                double alpha = 1.0;
                double beta = 0.0;
                if (t.size < 100) // small tasks run on the host, larger ones on the device
                {
                    printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
                    cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
                    cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
                    cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
                    cudaStreamSynchronize(stream[0]);
                    gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
                }
                else
                {
                    printf("\nTask [%2d], thread [%2d], size [%4d], on device", t.id, tid, t.size);
                    cublasSetStream(handle[tid+1], stream[tid+1]);
                    cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
                    cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
                    cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
                    cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
                }
            }
            return NULL;
        }
        #else
        template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
        {
            double alpha = 1.0;
            double beta = 0.0;
            if (t.size < 100) // small tasks run on the host, larger ones on the device
            {
                printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
                cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
                cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
                cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
                cudaStreamSynchronize(stream[0]);
                gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
            }
            else
            {
                printf("\nTask [%2d], thread [%2d], size[%4d], on device", t.id, tid, t.size);
                cublasSetStream(handle[tid+1], stream[tid+1]);
                cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
                cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
                cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
                cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
            }
        }
        #endif

        template <typename T> void initialise_tasks(std::vector< Task<T> > &TaskList)
        {
            for (unsigned int i = 0; i < TaskList.size(); i++)
            {
                int size;
                size = std::max((int)(drand48()*1000.0), 64);
                TaskList[i].allocate(size, i);
            }
        }

        int main()
        {
            printf("\n\tStart.\n");

            cudaDeviceProp device_prop;
            cudaGetDeviceProperties(&device_prop, 0);
            if (!device_prop.managedMemory)
            {
                printf("\n\tUnified Memory not supported\n");
                getchar();
                return 1;
            }
            if (device_prop.computeMode == cudaComputeModeProhibited) // no host thread may use this device
            {
                printf("\n\tComputeMode is cudaComputeModeProhibited\n");
                getchar();
                return 1;
            }

            srand48(time(NULL));
            const int nthreads = 4;
            cudaStream_t *streams = new cudaStream_t[nthreads+1];
            cublasHandle_t *handles = new cublasHandle_t[nthreads+1];
            for (int i=0; i<nthreads+1; i++)
            {
                cudaStreamCreate(&streams[i]);
                cublasCreate(&handles[i]);
            }

            unsigned int N = 40;
            std::vector<Task<double> > TaskList(N);
            initialise_tasks(TaskList);
            cudaSetDevice(0);

        #ifdef USE_PTHREADS
            pthread_t threads[nthreads];
            threadData *InputToThreads = new threadData[nthreads];
            int temp = TaskList.size() / nthreads;
            for (int i=0; i < nthreads; i++)
            {
                InputToThreads[i].tid = i;
                InputToThreads[i].streams = streams;
                InputToThreads[i].handles = handles;

                if (temp == 0)  // fewer tasks than threads
                {
                    InputToThreads[i].taskSize = 0;
                    InputToThreads[i].TaskListPtr = &TaskList[0];
                }
                else            // at least as many tasks as threads: split evenly, the remainder all goes to the last thread
                {
                    if (i == nthreads - 1)
                    {
                        InputToThreads[i].taskSize = temp + (TaskList.size() % nthreads);
                        InputToThreads[i].TaskListPtr = &TaskList[i*temp]; // the original sample also added the remainder to this offset, which would skip tasks and overrun the vector whenever N % nthreads != 0
                    }
                    else
                    {
                        InputToThreads[i].taskSize = temp;
                        InputToThreads[i].TaskListPtr = &TaskList[i*temp];
                    }
                }
                pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
            }
            for (int i=0; i < nthreads; i++)
                pthread_join(threads[i], NULL);
        #else
            omp_set_num_threads(nthreads);
            #pragma omp parallel for schedule(dynamic)
            for (int i=0; i<TaskList.size(); i++)
            {
                int tid = omp_get_thread_num();
                execute(TaskList[i], handles, streams, tid);
            }
        #endif
            cudaDeviceSynchronize();

            // cleanup
            for (int i=0; i<nthreads+1; i++)
            {
                cudaStreamDestroy(streams[i]);
                cublasDestroy(handles[i]);
            }
            std::vector< Task<double> >().swap(TaskList);
            printf("\n\tFinish.\n");
            getchar();
            return 0;
        }
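
    The post does not show a build command. A plausible one (my assumption: the file is saved as UnifiedMemoryStreams.cu, the host is Linux with gcc, and the CUDA toolkit with cuBLAS is on the path; on Windows/MSVC the OpenMP switch would be -Xcompiler /openmp and the pthreads build would need the pthreadVC2 library referenced in the code) is:

        # OpenMP version (USE_PTHREADS left undefined)
        nvcc -Xcompiler -fopenmp UnifiedMemoryStreams.cu -o UnifiedMemoryStreams -lcublas -lgomp

        # pthreads version
        nvcc -DUSE_PTHREADS -Xcompiler -pthread UnifiedMemoryStreams.cu -o UnifiedMemoryStreams -lcublas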

    ▶ Output: OpenMP

        Start.
    
    Task [ 0], thread [ 0], size[ 721], on device
    Task [ 1], thread [ 2], size[ 925], on device
    Task [ 2], thread [ 3], size[ 250], on device
    Task [ 3], thread [ 1], size[ 832], on device
    Task [ 4], thread [ 2], size[ 798], on device
    Task [ 5], thread [ 3], size[ 379], on device
    Task [ 6], thread [ 2], size[ 110], on device
    Task [ 7], thread [ 1], size[ 147], on device
    Task [ 8], thread [ 0], size[ 941], on device
    Task [ 9], thread [ 3], size[ 494], on device
    Task [12], thread [ 1], size[ 619], on device
    Task [10], thread [ 0], size[ 144], on device
    Task [11], thread [ 2], size[ 477], on device
    Task [13], thread [ 3], size[ 282], on device
    Task [14], thread [ 0], size[ 593], on device
    Task [15], thread [ 1], size[ 960], on device
    Task [16], thread [ 0], size[ 671], on device
    Task [17], thread [ 2], size[ 585], on device
    Task [18], thread [ 3], size[ 883], on device
    Task [19], thread [ 0], size[ 102], on device
    Task [20], thread [ 2], size[ 912], on device
    Task [21], thread [ 1], size[ 527], on device
    Task [22], thread [ 2], size[ 613], on device
    Task [23], thread [ 3], size[ 553], on device
    Task [24], thread [ 0], size[ 572], on device
    Task [25], thread [ 1], size[ 792], on device
    Task [26], thread [ 2], size[ 406], on device
    Task [27], thread [ 3], size[ 782], on device
    Task [28], thread [ 0], size[ 351], on device
    Task [29], thread [ 1], size[ 595], on device
    Task [30], thread [ 2], size[ 301], on device
    Task [31], thread [ 3], size[ 537], on device
    Task [32], thread [ 0], size[ 303], on device
    Task [35], thread [ 3], size[ 177], on device
    Task [34], thread [ 2], size[ 869], on device
    Task [33], thread [ 1], size[ 415], on device
    Task [36], thread [ 0], size[ 987], on device
    Task [37], thread [ 3], size[ 772], on device
    Task [38], thread [ 2], size[ 129], on device
    Task [39], thread [ 1], size[ 506], on device
        Finish.

    ▶ Output: pthreads

        Start.
    
    Task [ 0], thread [ 0], size [ 755], on device
    Task [20], thread [ 2], size [ 754], on device
    Task [10], thread [ 1], size [ 746], on device
    Task [30], thread [ 3], size [ 958], on device
    Task [31], thread [ 3], size [ 281], on device
    Task [11], thread [ 1], size [ 946], on device
    Task [ 1], thread [ 0], size [  77], on host
    Task [32], thread [ 3], size [ 144], on device
    Task [21], thread [ 2], size [ 432], on device
    Task [12], thread [ 1], size [ 105], on device
    Task [22], thread [ 2], size [ 837], on device
    Task [33], thread [ 3], size [ 402], on device
    Task [ 2], thread [ 0], size [ 239], on device
    Task [13], thread [ 1], size [ 149], on device
    Task [23], thread [ 2], size [ 764], on device
    Task [ 3], thread [ 0], size [ 775], on device
    Task [34], thread [ 3], size [ 913], on device
    Task [24], thread [ 2], size [ 182], on device
    Task [35], thread [ 3], size [ 865], on device
    Task [ 4], thread [ 0], size [ 729], on device
    Task [14], thread [ 1], size [ 110], on device
    Task [ 5], thread [ 0], size [ 604], on device
    Task [36], thread [ 3], size [ 107], on device
    Task [25], thread [ 2], size [ 403], on device
    Task [15], thread [ 1], size [ 842], on device
    Task [37], thread [ 3], size [ 858], on device
    Task [26], thread [ 2], size [  64], on host
    Task [ 6], thread [ 0], size [  64], on host
    Task [16], thread [ 1], size [ 103], on device
    Task [ 7], thread [ 0], size [ 852], on device
    Task [27], thread [ 2], size [ 826], on device
    Task [ 8], thread [ 0], size [ 673], on device
    Task [17], thread [ 1], size [ 243], on device
    Task [38], thread [ 3], size [ 178], on device
    Task [39], thread [ 3], size [ 788], on device
    Task [ 9], thread [ 0], size [  67], on host
    Task [18], thread [ 1], size [ 340], on device
    Task [28], thread [ 2], size [ 249], on device
    Task [19], thread [ 1], size [ 934], on device
    Task [29], thread [ 2], size [ 793], on device
        Finish.

    ▶ Takeaways:

    ● A C++ struct is used the way a class would be: it defines constructors, a destructor and other member functions.

    ● The cuBLAS library is used; note how each handle is created, bound to a stream with cublasSetStream, and then passed to the library calls. A minimal standalone sketch of that call pattern follows.

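    A minimal, self-contained sketch of the cuBLAS call pattern (my own illustration, not part of the original sample): create a handle, bind it to a stream, call cublasDgemv with host-side alpha/beta, and synchronize before reading the result. cuBLAS assumes column-major storage; the sample never validates the result, so the layout does not matter there, but for real row-major data CUBLAS_OP_T would be needed.

        #include <cstdio>
        #include <cuda_runtime.h>
        #include <cublas_v2.h>

        int main()
        {
            const int n = 256;
            double *A, *x, *y;
            cudaMallocManaged(&A, sizeof(double) * n * n);   // managed memory, reachable from host and device
            cudaMallocManaged(&x, sizeof(double) * n);
            cudaMallocManaged(&y, sizeof(double) * n);
            for (int i = 0; i < n * n; i++) A[i] = 1.0;
            for (int i = 0; i < n; i++) { x[i] = 1.0; y[i] = 0.0; }

            cudaStream_t stream;
            cudaStreamCreate(&stream);
            cublasHandle_t handle;
            cublasCreate(&handle);                           // the sample keeps one handle per worker thread
            cublasSetStream(handle, stream);                 // subsequent cuBLAS calls run in this stream

            double alpha = 1.0, beta = 0.0;
            // y = alpha * A * x + beta * y; A is n x n with leading dimension n, x and y have stride 1
            cublasDgemv(handle, CUBLAS_OP_N, n, n, &alpha, A, n, x, 1, &beta, y, 1);
            cudaDeviceSynchronize();                         // device-wide sync so the host may safely read managed memory, even on pre-Pascal GPUs
            printf("y[0] = %f (expect %d)\n", y[0], n);

            cublasDestroy(handle);
            cudaStreamDestroy(stream);
            cudaFree(A); cudaFree(x); cudaFree(y);
            return 0;
        }
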
    ● The managed-memory attach flags and the stream-attach function involved:

        // driver_types.h
        #define cudaMemAttachGlobal 0x01  // memory may be accessed by any stream on any device
        #define cudaMemAttachHost   0x02  // memory may not be accessed by any device stream (host access only)
        #define cudaMemAttachSingle 0x04  // memory may only be accessed by a single stream

        // cuda_runtime.h
        template<class T> static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, T *devPtr, size_t length = 0, unsigned int flags = cudaMemAttachSingle)
        {
            return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
        }

        // cuda_runtime_api.h
        extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle));
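
    As a rough sketch of how these attach flags are meant to be used (my own minimal example, not from the post): managed memory is attached to the host while the CPU works on it, then re-attached to a single stream before GPU work is launched in that stream, so that other streams are not forced to synchronize on it.

        #include <cstdio>
        #include <cuda_runtime.h>

        __global__ void scale(float *p, int n)               // hypothetical kernel, for illustration only
        {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i < n) p[i] *= 2.0f;
        }

        int main()
        {
            const int n = 1024;
            float *buf = NULL;
            cudaStream_t stream;
            cudaStreamCreate(&stream);
            cudaMallocManaged(&buf, n * sizeof(float), cudaMemAttachGlobal);

            // Attach to the host so the CPU can touch buf without synchronizing the whole device
            cudaStreamAttachMemAsync(stream, buf, 0, cudaMemAttachHost);
            cudaStreamSynchronize(stream);
            for (int i = 0; i < n; i++) buf[i] = (float)i;

            // Hand the memory over to exactly one stream, then launch GPU work in that stream
            cudaStreamAttachMemAsync(stream, buf, 0, cudaMemAttachSingle);
            scale<<<(n + 255) / 256, 256, 0, stream>>>(buf, n);
            cudaStreamSynchronize(stream);                    // after this, host access to buf is legal again

            printf("buf[10] = %f\n", buf[10]);                // expect 20.0
            cudaFree(buf);
            cudaStreamDestroy(stream);
            return 0;
        }
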
  • Original post: https://www.cnblogs.com/cuancuancuanhao/p/8012435.html