• 0_Simple__vectorAdd + 0_Simple__vectorAdd_nvrtc + 0_Simple__vectorAddDrv


    ▶ 使用 CUDA Runtime API,运行时编译,Driver API 三种接口计算向量加法

    ▶ 源代码,CUDA Runtime API

     1 #include <stdio.h>
     2 #include <cuda_runtime.h>
     3 #include "device_launch_parameters.h"
     4 #include <helper_cuda.h>
     5 
     6 #define ELEMENT 50000
     7 
     // One thread per element: C[i] = A[i] + B[i], guarded against the
     // partial block at the tail of the grid (grid size is rounded up).
     __global__ void vectorAdd(const float *A, const float *B, float *C, int size)
     {
         const int idx = threadIdx.x + blockIdx.x * blockDim.x;
         if (idx >= size)
             return;
         C[idx] = A[idx] + B[idx];
     }
    14 
    // Reports a failed CUDA Runtime call with file/line context and bails out.
    #define CUDA_CHECK(call)                                                  \
        do {                                                                  \
            cudaError_t err_ = (call);                                        \
            if (err_ != cudaSuccess)                                          \
            {                                                                 \
                printf("\tCUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                       cudaGetErrorString(err_));                             \
                getchar();                                                    \
                return 1;                                                     \
            }                                                                 \
        } while (0)

    // Vector addition via the CUDA Runtime API: fill two host arrays with
    // pseudo-random values in [0,1], add them on the GPU, verify on the CPU.
    // Returns 0 on success, 1 on any CUDA error or result mismatch.
    int main()
    {
        printf("\tStart.\n");
        size_t size = ELEMENT * sizeof(float);

        float *h_A = (float *)malloc(size);
        float *h_B = (float *)malloc(size);
        float *h_C = (float *)malloc(size);
        float *d_A = NULL;
        float *d_B = NULL;
        float *d_C = NULL;
        CUDA_CHECK(cudaMalloc((void **)&d_A, size));
        CUDA_CHECK(cudaMalloc((void **)&d_B, size));
        CUDA_CHECK(cudaMalloc((void **)&d_C, size));
        for (int i = 0; i < ELEMENT; ++i)
        {
            h_A[i] = rand() / (float)RAND_MAX;  // value in [0, 1]
            h_B[i] = rand() / (float)RAND_MAX;
        }
        CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

        int threadsPerBlock = 256;
        int blocksPerGrid = (ELEMENT + threadsPerBlock - 1) / threadsPerBlock;  // ceil-div
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, ELEMENT);
        CUDA_CHECK(cudaGetLastError());  // launch-configuration errors surface here
        // The blocking copy also synchronizes with the kernel before h_C is read.
        CUDA_CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

        for (int i = 0; i < ELEMENT; ++i)
        {
            if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
            {
                printf("\n\tResult error at i = %d, h_A[i] = %f, h_B[i] = %f, h_C[i] = %f\n", i, h_A[i], h_B[i], h_C[i]);
                getchar();
                return 1;
            }
        }

        free(h_A);
        free(h_B);
        free(h_C);
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
        printf("\n\tFinish.\n");
        getchar();
        return 0;
    }

    ● 输出结果:

        Start.
    
        Finish.

    ▶ 源代码,运行时编译

    // vectorAdd_kernel.cu
    // One thread per element; extern "C" keeps the symbol unmangled so
    // cuModuleGetFunction can look it up by the plain name "vectorAdd".
    extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int size)
    {
        const int tid = threadIdx.x + blockDim.x * blockIdx.x;
        if (tid >= size)
            return;
        C[tid] = A[tid] + B[tid];
    }
     1 // vectorAdd.cpp
     2 #include <stdio.h>
     3 #include <cuda_runtime.h>
     4 #include "device_launch_parameters.h"
     5 #include <cuda.h>
     6 #include <nvrtc_helper.h>
     7 
     8 #define ELEMENT 50000
     9 
    // Vector addition compiled at run time with NVRTC: the kernel source file
    // is compiled to PTX, loaded as a module, and launched via the Driver API.
    // Returns 0 on success, 1 on a result mismatch.
    int main()
    {
        printf("\n\tStart.\n");

        char *ptx;
        size_t ptxSize;
        // NOTE: backslashes in a Windows path must be escaped inside a C string literal.
        char *kernel_file = (char *)"D:\\Program\\CUDA9.0\\Samples\\0_Simple\\vectorAdd_nvrtc\\vectorAdd_kernel.cu";
        compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0);  // .cu source -> PTX (nvrtc_helper)
        CUmodule module = loadPTX(ptx, 1, NULL);                    // PTX -> loaded module
        CUfunction kernel_addr;
        cuModuleGetFunction(&kernel_addr, module, "vectorAdd");     // extern "C" name, unmangled

        size_t size = ELEMENT * sizeof(float);

        float *h_A = (float *)malloc(size);
        float *h_B = (float *)malloc(size);
        float *h_C = (float *)malloc(size);
        CUdeviceptr d_A, d_B, d_C;
        cuMemAlloc(&d_A, size);
        cuMemAlloc(&d_B, size);
        cuMemAlloc(&d_C, size);
        for (int i = 0; i < ELEMENT; ++i)
        {
            h_A[i] = rand() / (float)RAND_MAX;  // value in [0, 1]
            h_B[i] = rand() / (float)RAND_MAX;
        }
        cuMemcpyHtoD(d_A, h_A, size);
        cuMemcpyHtoD(d_B, h_B, size);

        int threadsPerBlock = 256;
        dim3 cudaBlockSize(threadsPerBlock, 1, 1);
        dim3 cudaGridSize((ELEMENT + threadsPerBlock - 1) / threadsPerBlock, 1, 1);  // ceil-div
        int element = ELEMENT;
        // cuLaunchKernel receives the ADDRESSES of the kernel arguments.
        void *arr[] = { (void *)&d_A, (void *)&d_B, (void *)&d_C, (void *)&element };
        cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z,
                       cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, 0, 0, &arr[0], 0);
        cuCtxSynchronize();  // the launch is asynchronous; wait before reading back
        cuMemcpyDtoH(h_C, d_C, size);

        for (int i = 0; i < ELEMENT; ++i)
        {
            if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
            {
                printf("\n\tResult error at i = %d, h_A[i] = %f, h_B[i] = %f, h_C[i] = %f\n", i, h_A[i], h_B[i], h_C[i]);
                getchar();
                return 1;
            }
        }

        free(h_A);
        free(h_B);
        free(h_C);
        cuMemFree(d_A);
        cuMemFree(d_B);
        cuMemFree(d_C);
        printf("\n\tFinish.\n");
        getchar();
        return 0;
    }

    ● 输出结果:

            Start.
    > Using CUDA Device [0]: GeForce GTX 1070
    > GPU Device has SM 6.1 compute capability
    
            Finish.

    ▶ 源代码,Driver API,也需要上面的 vectorAdd_kernel.cu,调用核函数有三种方式,中间那种有点问题,结果不对

      1 #include <stdio.h>
      2 #include <helper_cuda.h>
      3 #include <cuda.h>
      4 #include <string>
      5 #include <drvapi_error_string.h>
      6 
      #define ELEMENT 50000
      // Backslashes inside a C string literal must be doubled ("\\"); the
      // unescaped form would be parsed as (invalid) escape sequences.
      #define PATH "C:\\ProgramData\\NVIDIA Corporation\\CUDA Samples\\v9.1\\0_Simple\\vectorAddDrv\\data\\"

      #if defined(_WIN64) || defined(__LP64__)
      #define PTX_FILE "vectorAdd_kernel64.ptx"  // PTX built for 64-bit pointers
      #else
      #define PTX_FILE "vectorAdd_kernel32.ptx"  // PTX built for 32-bit pointers
      #endif
     15 
     16 using namespace std;
     17 
     // Fills data[0..n) with pseudo-random floats in [0, 1] drawn from rand().
     void RandomInit(float *data, int n)
     {
         int i = 0;
         while (i < n)
         {
             data[i] = rand() / (float)RAND_MAX;
             ++i;
         }
     }
     23 
     // Vector addition through the raw Driver API: load pre-built PTX from
     // disk, JIT it into a module, and launch the kernel. Demonstrates the
     // different ways of passing kernel arguments to cuLaunchKernel.
     int main(int argc, char **argv)
     {
         printf("\n\tStart.\n");
         cuInit(0);  // must precede every other Driver API call (counterpart of cudaSetDevice(0))
         CUcontext cuContext;
         cuCtxCreate(&cuContext, 0, 0);

         // Read the whole PTX file into a NUL-terminated buffer.
         string module_path, ptx_source;
         module_path = PATH PTX_FILE;  // pick the 32/64-bit build automatically
         FILE *fp = fopen(module_path.c_str(), "rb");
         if (fp == NULL)
         {
             printf("\n\tCannot open %s\n", module_path.c_str());
             getchar();
             return 1;
         }
         fseek(fp, 0, SEEK_END);
         int file_size = ftell(fp);
         char *buf = new char[file_size + 1];
         fseek(fp, 0, SEEK_SET);
         fread(buf, sizeof(char), file_size, fp);
         fclose(fp);
         buf[file_size] = '\0';  // PTX is consumed as a C string
         ptx_source = buf;
         delete[] buf;

         CUmodule cuModule;
         if (module_path.rfind("ptx") != string::npos)  // .ptx needs a JIT step
         {
             // JIT options: names go in jitOptions, values in jitOptVals.
             const unsigned int jitNumOptions = 3;
             CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
             void **jitOptVals = new void *[jitNumOptions];
             // size of the info-log buffer
             jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
             int jitLogBufferSize = 1024;
             jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
             // the info-log buffer itself
             jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
             char *jitLogBuffer = new char[jitLogBufferSize];
             jitOptVals[1] = jitLogBuffer;
             // cap the register count per thread
             jitOptions[2] = CU_JIT_MAX_REGISTERS;
             int jitRegCount = 32;
             jitOptVals[2] = (void *)(size_t)jitRegCount;
             cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);
             //printf("> PTX JIT log:\n%s\n", jitLogBuffer);  // dump the JIT log if needed
             delete[] jitLogBuffer;
             delete[] jitOptVals;
             delete[] jitOptions;
         }
         else  // a .cubin loads directly, no JIT (branch not taken in this example)
             cuModuleLoad(&cuModule, module_path.c_str());

         CUfunction vecAdd_kernel;
         cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel");  // symbol exported by the sample's PTX

         // Allocate buffers and run the computation.
         int element = ELEMENT;
         size_t  size = ELEMENT * sizeof(float);
         float * h_A, *h_B, *h_C;
         CUdeviceptr d_A, d_B, d_C;
         h_A = (float *)malloc(size);
         h_B = (float *)malloc(size);
         h_C = (float *)malloc(size);
         RandomInit(h_A, ELEMENT);
         RandomInit(h_B, ELEMENT);
         cuMemAlloc(&d_A, size);
         cuMemAlloc(&d_B, size);
         cuMemAlloc(&d_C, size);
         cuMemcpyHtoD(d_A, h_A, size);
         cuMemcpyHtoD(d_B, h_B, size);

         int threadsPerBlock = 256;
         int blocksPerGrid = (ELEMENT + threadsPerBlock - 1) / threadsPerBlock;
         if (1)       // way 1: kernelParams — an array of POINTERS to each argument
         {
             void *args[] = { &d_A, &d_B, &d_C, &element };
             cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL);
         }
         else if (1)  // way 2: "extra" with a packed argument buffer
         {
             // Fixed from the original post: arguments must be packed at their
             // byte offsets into a raw char buffer, and the buffer handed over
             // through the CU_LAUNCH_PARAM_* list — not passed directly as "extra".
             size_t offset = 0;
             char argBuffer[64];
             *((CUdeviceptr *)&argBuffer[offset]) = d_A;
             offset += sizeof(d_A);
             *((CUdeviceptr *)&argBuffer[offset]) = d_B;
             offset += sizeof(d_B);
             *((CUdeviceptr *)&argBuffer[offset]) = d_C;
             offset += sizeof(d_C);
             *((int *)&argBuffer[offset]) = element;
             offset += sizeof(element);
             void *extra[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
                               CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
                               CU_LAUNCH_PARAM_END };
             cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, extra);
         }
         else         // way 3: the same packed-buffer technique, spelled out
         {
             // CU_LAUNCH_PARAM_BUFFER_SIZE expects a size_t*, hence offset is size_t.
             size_t offset = 0;
             char argBuffer[256];
             *((CUdeviceptr *)&argBuffer[offset]) = d_A;
             offset += sizeof(d_A);
             *((CUdeviceptr *)&argBuffer[offset]) = d_B;
             offset += sizeof(d_B);
             *((CUdeviceptr *)&argBuffer[offset]) = d_C;
             offset += sizeof(d_C);
             *((int *)&argBuffer[offset]) = element;
             offset += sizeof(element);
             void *kernel_launch_config[5] =
             { CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END };
             cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, (void **)&kernel_launch_config);
         }
         cuCtxSynchronize();  // launches are asynchronous
         cuMemcpyDtoH(h_C, d_C, size);
         int i;
         for (i = 0; i < ELEMENT; ++i)
         {
             float sum = h_A[i] + h_B[i];
             if (fabs(h_C[i] - sum) > 1e-7f)
             {
                 printf("Error at i == %d, h_C[i] == %f, sum == %f", i, h_C[i], sum);
                 break;
             }
         }

         // Release everything we allocated (the original sample leaked these).
         free(h_A);
         free(h_B);
         free(h_C);
         cuMemFree(d_A);
         cuMemFree(d_B);
         cuMemFree(d_C);
         cuCtxDestroy(cuContext);
         printf("\n\tFinish: %s\n", (i == ELEMENT) ? "Pass" : "Fail");
         getchar();
         return 0;
     }

    ● 输出结果

        Start.
    
        Finish.

    ▶ 涨姿势:

    ● 从源代码中删减了的部分

     1 CUresult CleanupNoFailure() // best-effort teardown: frees device/host buffers and the context; returns only the LAST CUresult (globals from the full sample)
     2 {
     3     CUresult error;
     4     // Free device memory
     5     if (d_A)    
     6         error = cuMemFree(d_A);
     7     if (d_B)
     8         error = cuMemFree(d_B);
     9     if (d_C)
    10         error = cuMemFree(d_C);
    11     // Free host memory
    12     if (h_A)
    13         free(h_A);
    14     if (h_B)
    15         free(h_B);
    16     if (h_C)
    17         free(h_C);
    18     error = cuCtxDestroy(cuContext);
    19     return error;
    20 }
    21 
    22 void Cleanup(bool noError)  // reports failure and exits; noprompt is a global flag from the full sample — TODO confirm against the original source
    23 {
    24     CUresult error = CleanupNoFailure();
    25     if (!noError || error != CUDA_SUCCESS)
    26     {
    27         printf("Function call failed
    FAILED
    ");
    28         exit(EXIT_FAILURE);
    29     }
    30     if (!noprompt)
    31     {
    32         printf("
    Press ENTER to exit...
    ");
    33         fflush(stdout);
    34         fflush(stderr);
    35         getchar();
    36     }
    37 }
    38                           
    39 if (error != CUDA_SUCCESS)  // caller invokes Cleanup on failure
    40     Cleanup(false);
    41 
    42 if (argc > 1)               // main() accepts -device=n to select the device
    43 {
    44     bool bFound = false;
    45     for (int param = 0; param < argc; param++)          // inspect each argument in turn
    46     {
    47         int string_start = 0;
    48         while (argv[param][string_start] == '-')        // skip leading "-" characters
    49             string_start++;
    50         char *string_argv = &argv[param][string_start];
    51         if (!strncmp(string_argv, "device", 6))         // does the argument start with "device"?
    52         {
    53             int len = (int)strlen(string_argv);
    54             while (string_argv[len] != '=')
    55                 len--;
    56             devID = atoi(&string_argv[++len]);
    57             bFound = true;
    58         }
    59         if (bFound)
    60             break;
    61     }
    62 }
  • 相关阅读:
    堆栈平衡
    visual studio 代码片段
    查询系统所有句柄(可以筛选进程打开的全部句柄)
    nginx 代理
    vue publicPath 相对路径 绝对路径模式
    vue修改该项目浏览器顶部图片和title
    h5 端自适应显示屏宽度,更改body html字体,为rem使用做准备的js代码
    error: Unexpected console statement (no-console) 解决办法
    flex justify-content
    Echarts报“export ‘default‘ (imported as ‘echarts‘) was not found in ‘echarts‘错解决方法!
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/8012515.html
Copyright © 2020-2023  润新知