• Several modes of CUDA parallel computing


    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <stdio.h>
    #include <time.h>
    #include <stdlib.h>

    #define MAX 120
    #define MIN 0

    cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,
            float* etime);
    cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,
            float* etime, int type);

    // Block-parallel kernel: one block per element, blockIdx.x selects the element.
    __global__ void addKernel(int *c, const int *a, const int *b) {
        int i = blockIdx.x;
        c[i] = a[i] + b[i];
    }

    // Thread-parallel kernel: one thread per element, threadIdx.x selects the element.
    __global__ void addKernelThread(int *c, const int *a, const int *b) {
        int i = threadIdx.x;
        c[i] = a[i] + b[i];
    }
    int main() {
        const int arraySize = 800;
        srand((unsigned) time(NULL));
        int a[arraySize] = { 1, 2, 3, 4, 5 };
        int b[arraySize] = { 10, 20, 30, 40, 50 };

        for (int i = 0; i < arraySize; i++) {
            a[i] = rand() % (MAX + 1 - MIN) + MIN;
            b[i] = rand() % (MAX + 1 - MIN) + MIN;
        }
        int c[arraySize] = { 0 };
        // Add vectors in parallel.
        cudaError_t cudaStatus;
        int num = 0;
        cudaDeviceProp prop;
        cudaStatus = cudaGetDeviceCount(&num);
        for (int i = 0; i < num; i++) {
            cudaGetDeviceProperties(&prop, i);
        }

        float time;
        cudaStatus = addWithCudaStream(c, a, b, arraySize, &time);
        printf("Elapsed time of stream is : %f \n", time);
        printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
                a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2],
                a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0],
                b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3],
                b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1],
                c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addWithCudaStream failed!");
            return 1;
        }
        cudaStatus = addWithCuda(c, a, b, arraySize, &time, 0);
        printf("Elapsed time of Block is : %f \n", time);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addWithCuda failed!");
            return 1;
        }
        printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
                a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2],
                a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0],
                b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3],
                b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1],
                c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]);

        cudaStatus = addWithCuda(c, a, b, arraySize, &time, 1);
        printf("Elapsed time of thread is : %f \n", time);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addWithCuda failed!");
            return 1;
        }
        printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
                a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2],
                a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0],
                b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3],
                b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1],
                c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]);

        cudaStatus = addWithCudaStream(c, a, b, arraySize, &time);
        printf("Elapsed time of stream is : %f \n", time);
        printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
                a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2],
                a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0],
                b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3],
                b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1],
                c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addWithCudaStream failed!");
            return 1;
        }
        // cudaThreadExit must be called before exiting in order for profiling and
        // tracing tools such as Nsight and Visual Profiler to show complete traces.
        cudaStatus = cudaThreadExit();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaThreadExit failed!");
            return 1;
        }
        return 0;
    }
    // Helper function for using CUDA to add vectors in parallel, one stream per element.
    cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,
            float* etime) {
        int *dev_a = 0;
        int *dev_b = 0;
        int *dev_c = 0;
        cudaStream_t stream[5];
        clock_t start, stop;
        float time;
        cudaError_t cudaStatus;

        // Choose which GPU to run on, change this on a multi-GPU system.
        cudaStatus = cudaSetDevice(0);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr,
                    "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
            goto Error;
        }
        // Allocate GPU buffers for three vectors (two input, one output).
        cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            goto Error;
        }
        cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            goto Error;
        }
        cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            goto Error;
        }
        // Copy input vectors from host memory to GPU buffers.
        cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
                cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),
                cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        for (int i = 0; i < 5; i++) {
            cudaStreamCreate(&stream[i]);   // create the streams
        }
        // Launch one single-thread kernel on each stream; note that only the
        // first five elements of the arrays are processed here.
        for (int i = 0; i < 5; i++) {
            addKernel<<<1, 1, 0, stream[i]>>>(dev_c + i, dev_a + i, dev_b + i);   // launch on stream i
        }
        // The launches above are asynchronous; this clock() pair times the
        // host-side wait for the streams to finish.
        start = clock();
        cudaDeviceSynchronize();
        stop = clock();
        time = (float) (stop - start) / CLOCKS_PER_SEC;
        *etime = time;
        // cudaThreadSynchronize waits for the kernel to finish, and returns
        // any errors encountered during the launch.
        cudaStatus = cudaThreadSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr,
                    "cudaThreadSynchronize returned error code %d after launching addKernel!\n",
                    cudaStatus);
            goto Error;
        }
        // Copy output vector from GPU buffer to host memory.
        cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
                cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
    Error:
        for (int i = 0; i < 5; i++) {
            cudaStreamDestroy(stream[i]);   // destroy the streams
        }
        cudaFree(dev_c);
        cudaFree(dev_a);
        cudaFree(dev_b);
        return cudaStatus;
    }
    // Helper function for using CUDA to add vectors in parallel.
    // type == 0: one block per element (block parallel); otherwise one thread per element (thread parallel).
    cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,
            float * etime, int type) {
        int *dev_a = 0;
        int *dev_b = 0;
        int *dev_c = 0;
        clock_t start, stop;
        float time;
        cudaError_t cudaStatus;

        // Choose which GPU to run on, change this on a multi-GPU system.
        cudaStatus = cudaSetDevice(0);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr,
                    "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
            goto Error;
        }
        // Allocate GPU buffers for three vectors (two input, one output).
        cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            goto Error;
        }
        cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            goto Error;
        }
        cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            goto Error;
        }
        // Copy input vectors from host memory to GPU buffers.
        cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
                cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),
                cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }

        // The kernel launches are asynchronous, so this clock() pair measures
        // mainly the launch overhead, not the kernel execution time.
        if (type == 0) {
            start = clock();
            addKernel<<<size, 1>>>(dev_c, dev_a, dev_b);          // block parallel: size blocks of 1 thread
        } else {
            start = clock();
            addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b);    // thread parallel: 1 block of size threads
        }
        stop = clock();
        time = (float) (stop - start) / CLOCKS_PER_SEC;
        *etime = time;
        // cudaThreadSynchronize waits for the kernel to finish, and returns
        // any errors encountered during the launch.
        cudaStatus = cudaThreadSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr,
                    "cudaThreadSynchronize returned error code %d after launching addKernel!\n",
                    cudaStatus);
            goto Error;
        }
        // Copy output vector from GPU buffer to host memory.
        cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
                cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
    Error:
        cudaFree(dev_c);
        cudaFree(dev_a);
        cudaFree(dev_b);
        return cudaStatus;
    }
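
    A note on the timing above (my own observation, not something stated in the original post): kernel launches are asynchronous, so wrapping clock() around a launch mostly measures launch overhead, and wrapping it around cudaDeviceSynchronize() measures how long the host waits, not the kernel itself. A minimal sketch using CUDA events, reusing the dev_a/dev_b/dev_c buffers and addKernel from addWithCuda, could look like this:

    cudaEvent_t evStart, evStop;
    cudaEventCreate(&evStart);
    cudaEventCreate(&evStop);
    cudaEventRecord(evStart, 0);                      // record on the default stream
    addKernel<<<size, 1>>>(dev_c, dev_a, dev_b);      // the kernel being timed
    cudaEventRecord(evStop, 0);
    cudaEventSynchronize(evStop);                     // wait until the kernel and the stop event complete
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, evStart, evStop);       // elapsed GPU time in milliseconds
    cudaEventDestroy(evStart);
    cudaEventDestroy(evStop);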

    The program above uses three kinds of parallelism: thread parallel, block parallel, and stream parallel. Running five computations with these three methods, I found that the first stream computation produces wrong results, even though the subroutine being called does not change. I have not figured out why.
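
    A guess at the cause (my reading of the code, not confirmed in the original post): addWithCudaStream only launches five single-element kernels, for elements 0 to 4, while the printout shows the last five elements of the 800-element arrays. On the first call those elements have never been written by any kernel, so the printed values come from an uninitialized device buffer; on later calls they happen to be correct, presumably because cudaMalloc hands back the allocation just freed by the block/thread version, which still holds the full results. A sketch that covers the whole array with a fixed number of streams, reusing addKernelThread (NSTREAMS and the chunking logic are introduced here and are not part of the original), could look like this:

    const int NSTREAMS = 5;
    cudaStream_t streams[NSTREAMS];
    for (int i = 0; i < NSTREAMS; i++) {
        cudaStreamCreate(&streams[i]);
    }
    int chunk = (int) ((size + NSTREAMS - 1) / NSTREAMS);   // elements per stream, rounded up
    for (int i = 0; i < NSTREAMS; i++) {
        int offset = i * chunk;
        int count = (int) size - offset;
        if (count <= 0) break;
        if (count > chunk) count = chunk;
        // One thread per element of this chunk (assumes count <= 1024, the per-block thread limit).
        addKernelThread<<<1, count, 0, streams[i]>>>(dev_c + offset, dev_a + offset, dev_b + offset);
    }
    cudaDeviceSynchronize();   // wait for all streams before copying the results back
    for (int i = 0; i < NSTREAMS; i++) {
        cudaStreamDestroy(streams[i]);
    }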

    Elapsed time of stream is : 0.000006
    {47,86,67,35,16} + {114,39,110,20,101} = {158,123,92,107,127}
    Elapsed time of Block is : 0.000006
    {47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
    Elapsed time of stream is : 0.000008
    {47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
    Elapsed time of thread is : 0.000004
    {47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
    Elapsed time of stream is : 0.000007
    {47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
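
    For completeness (this kernel is not in the original post): block parallelism and thread parallelism are usually combined, with many blocks of many threads each, so the array length is no longer limited by the number of threads a single block can hold:

    __global__ void addKernelGrid(int *c, const int *a, const int *b, int n) {
        // Global index across all blocks and threads.
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            c[i] = a[i] + b[i];
        }
    }

    // Example launch: 256 threads per block, enough blocks to cover all n elements.
    // addKernelGrid<<<(n + 255) / 256, 256>>>(dev_c, dev_a, dev_b, n);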

• Original post: https://www.cnblogs.com/hiramlee0534/p/5934923.html