• CUDA C Study Notes: Vector Addition


    Compute the vector addition A + B = C with CUDA.

    Workflow:

    1. Allocate host memory for vector A, vector B, and the result vector C.

    2. Initialize the data: fill vectors A and B with random numbers between 0 and 1.

    3. Allocate the GPU memory needed for A, B, and C.

    4. Copy the input data from host memory to GPU memory.

    5. Compute the number of threads and thread blocks needed (worked example after the list).

    6. Launch the GPU addition kernel.

    7. Copy the result from GPU memory back to host memory.

    8. Recompute the sum on the CPU and check it against the GPU result.

    9. Free the GPU memory.

    10. Free the host memory.

    11. Reset the GPU device state.
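
    For step 5, the block count is a ceiling division so that every element gets its own thread. With the values used in the code below (numElements = 50000, threadsPerBlock = 256):

      blocksPerGrid = (50000 + 256 - 1) / 256 = 196

    That launches 196 * 256 = 50176 threads; the if (i < numElements) guard in the kernel keeps the extra 176 threads from writing past the end of the vectors.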

    Key points: memory management and data copies.
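
    Those two key points are the skeleton of the whole program. A condensed outline (not a standalone program; these are the same calls as in the full listing below, shown for vector A only, with B and C following the same pattern):

      float *host_A = (float *)malloc(size);                  // step 1: host allocation
      float *device_A = NULL;
      cudaMalloc((void **)&device_A, size);                   // step 3: device allocation
      cudaMemcpy(device_A, host_A, size,
                 cudaMemcpyHostToDevice);                     // step 4: copy host -> device
      vectorAdd<<<blocksPerGrid, threadsPerBlock>>>
               (device_A, device_B, device_C, numElements);   // step 6: kernel launch
      cudaMemcpy(host_C, device_C, size,
                 cudaMemcpyDeviceToHost);                     // step 7: copy device -> host
      cudaFree(device_A);                                     // step 9: free device memory
      free(host_A);                                           // step 10: free host memory
      cudaDeviceReset();                                      // step 11: reset the device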

    Code:

      #include <stdio.h>
      #include <stdlib.h>
      #include <math.h>
      #include <cuda_runtime.h>

      // Kernel: each thread computes one element of C = A + B.
      __global__ void
      vectorAdd(const float *A, const float *B, float *C, int numElements)
      {
          int i = blockDim.x * blockIdx.x + threadIdx.x;

          if (i < numElements)
          {
              C[i] = A[i] + B[i];
          }
      }

      int main(void)
      {
          // Holds the return value of CUDA runtime calls
          cudaError_t err = cudaSuccess;

          // Number of vector elements
          int numElements = 50000;
          // Memory needed for one vector
          size_t size = numElements * sizeof(float);

          printf("[Vector addition of %d elements]\n", numElements);

          // Allocate host memory for A, B, and C
          float *host_A = (float *)malloc(size);
          float *host_B = (float *)malloc(size);
          float *host_C = (float *)malloc(size);
          // Check that the host allocations succeeded
          if (host_A == NULL || host_B == NULL || host_C == NULL)
          {
              fprintf(stderr, "Failed to allocate host vectors!\n");
              exit(EXIT_FAILURE);
          }
          // Initialize host vectors A and B
          for (int i = 0; i < numElements; ++i)
          {
              host_A[i] = rand() / (float)RAND_MAX;
              host_B[i] = rand() / (float)RAND_MAX;
          }

          // Allocate device memory and check for errors
          float *device_A = NULL;
          err = cudaMalloc((void **)&device_A, size);
          if (err != cudaSuccess)
          {
              fprintf(stderr, "Failed to allocate device memory for vector A (error code %s)!\n", cudaGetErrorString(err));
              exit(EXIT_FAILURE);
          }
          float *device_B = NULL;
          err = cudaMalloc((void **)&device_B, size);
          if (err != cudaSuccess)
          {
              fprintf(stderr, "Failed to allocate device memory for vector B (error code %s)!\n", cudaGetErrorString(err));
              exit(EXIT_FAILURE);
          }
          float *device_C = NULL;
          err = cudaMalloc((void **)&device_C, size);
          if (err != cudaSuccess)
          {
              fprintf(stderr, "Failed to allocate device memory for vector C (error code %s)!\n", cudaGetErrorString(err));
              exit(EXIT_FAILURE);
          }

          // Copy the input data to device memory and check for errors
          printf("Copy input data from the host memory to the CUDA device\n");
          err = cudaMemcpy(device_A, host_A, size, cudaMemcpyHostToDevice);
          if (err != cudaSuccess)
          {
              fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
              exit(EXIT_FAILURE);
          }
          err = cudaMemcpy(device_B, host_B, size, cudaMemcpyHostToDevice);
          if (err != cudaSuccess)
          {
              fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
              exit(EXIT_FAILURE);
          }

          // Compute the launch configuration
          // Threads per block
          int threadsPerBlock = 256;
          // Blocks per grid, rounded up so every element gets a thread
          int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
          printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
          vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(device_A, device_B, device_C, numElements);
          // Check whether the kernel launch succeeded
          err = cudaGetLastError();
          if (err != cudaSuccess)
          {
              fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
              exit(EXIT_FAILURE);
          }

          // Copy the result back to the host
          printf("Copy output data from the CUDA device to the host memory\n");
          err = cudaMemcpy(host_C, device_C, size, cudaMemcpyDeviceToHost);
          if (err != cudaSuccess)
          {
              fprintf(stderr, "Failed to copy the result from device to host (error code %s)!\n", cudaGetErrorString(err));
              exit(EXIT_FAILURE);
          }

          // Verify the result against a CPU computation
          for (int i = 0; i < numElements; ++i)
          {
              if (fabs(host_A[i] + host_B[i] - host_C[i]) > 1e-5)
              {
                  fprintf(stderr, "Result verification failed at element %d!\n", i);
                  exit(EXIT_FAILURE);
              }
          }
          printf("Test PASSED\n");

          // Free device memory and host memory
          err = cudaFree(device_A);
          err = cudaFree(device_B);
          err = cudaFree(device_C);
          free(host_A);
          free(host_B);
          free(host_C);

          // Reset the device state
          err = cudaDeviceReset();
          printf("Done\n");
          return 0;
      }
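
    For reference, assuming the source is saved as vectorAdd.cu, it compiles and runs with the CUDA toolkit's nvcc compiler:

      nvcc vectorAdd.cu -o vectorAdd
      ./vectorAdd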

    Ugh... getting things working on the GPU is such a hassle.

  • Original post: https://www.cnblogs.com/nwpuxuezha/p/4468860.html