• 0_Simple__inlinePTX + 0_Simple__inlinePTX_nvrtc


    在核函数代码中加入并行线程执行(Parallel Thread eXecution,PTX)汇编指令,通过它获取有关线程束的信息。并且在静态代码和运行时编译两种条件下使用。

    ▶ 源代码:静态使用

     1 #include <stdio.h>
     2 #include <assert.h>
     3 #include <cuda_runtime.h>
     4 #include "device_launch_parameters.h"
     5 #include <helper_functions.h>
     6 #include <helper_cuda.h>
     7 
     8 __global__ void sequence_gpu(int *d_ptr, int length)
     9 {
    10     int elemID = blockIdx.x * blockDim.x + threadIdx.x;
    11 
    12     if (elemID < length)
    13     {
    14         unsigned int laneid;
    15         asm("mov.u32 %0, %%laneid;" : "=r"(laneid));// 获取当前线程在线程束中的编号
    16         d_ptr[elemID] = laneid;
    17     }
    18 }
    19 
    // CPU reference: element i receives i mod 32, mirroring the lane IDs the
    // GPU kernel writes under a fully linear thread-to-element mapping.
    void sequence_cpu(int *h_ptr, int length)
    {
        for (int i = 0; i < length; ++i)
        {
            h_ptr[i] = i % 32;
        }
    }
    25 
    /*
     * Host entry point: fills a device buffer with each thread's warp lane ID
     * (via the inline-PTX kernel) and validates it against a CPU reference.
     * Returns EXIT_SUCCESS when the GPU result matches, EXIT_FAILURE otherwise.
     *
     * Fixes vs. original: restored the "\n" lost from both printf format
     * strings, and wrapped every CUDA runtime call in checkCudaErrors
     * (helper_cuda.h is already included) so failures are reported instead of
     * silently ignored.
     */
    int main(int argc, char **argv)
    {
        printf("CUDA inline PTX assembler sample\n");

        const int N = 1000;

        int dev = findCudaDevice(argc, (const char **) argv);
        if (dev == -1)
            return EXIT_FAILURE;

        // Device output buffer plus a pinned host buffer for the CPU reference.
        int *d_ptr;
        checkCudaErrors(cudaMalloc(&d_ptr, N * sizeof(int)));
        int *h_ptr;
        checkCudaErrors(cudaMallocHost(&h_ptr, N * sizeof(int)));

        dim3 cudaBlockSize(256, 1, 1);
        // Ceiling division so a partial final block still covers the tail.
        dim3 cudaGridSize((N + cudaBlockSize.x - 1) / cudaBlockSize.x, 1, 1);
        sequence_gpu<<<cudaGridSize, cudaBlockSize>>>(d_ptr, N);
        checkCudaErrors(cudaGetLastError());       // catch launch-configuration errors
        checkCudaErrors(cudaDeviceSynchronize());  // catch asynchronous execution errors

        sequence_cpu(h_ptr, N);

        // Copy the GPU result back into pinned host memory for comparison.
        int *h_d_ptr;
        checkCudaErrors(cudaMallocHost(&h_d_ptr, N * sizeof(int)));
        checkCudaErrors(cudaMemcpy(h_d_ptr, d_ptr, N * sizeof(int), cudaMemcpyDeviceToHost));

        bool bValid = true;
        for (int i = 0; i < N && bValid; i++)
        {
            if (h_ptr[i] != h_d_ptr[i])
                bValid = false;
        }

        printf("Test %s.\n", bValid ? "Successful" : "Failed");

        checkCudaErrors(cudaFree(d_ptr));
        checkCudaErrors(cudaFreeHost(h_ptr));
        checkCudaErrors(cudaFreeHost(h_d_ptr));

        getchar();
        return bValid ? EXIT_SUCCESS : EXIT_FAILURE;
    }

    ▶ 源代码:运行时编译

     1 /*inlinePTX_kernel.cu*/
     2 extern "C" __global__ void sequence_gpu(int *d_ptr, int length) 
     3 { 
     4     int elemID = blockIdx.x * blockDim.x + threadIdx.x; 
     5     if (elemID < length)
     6     {
     7         unsigned int laneid;
     8         asm("mov.u32 %0, %%laneid;" : "=r"(laneid));
     9         d_ptr[elemID] = laneid;
    10     }
    11 }
     1 /*inlinePTX.cpp*/
     2 #include <stdio.h>
     3 #include <assert.h>
     4 #include <cuda_runtime.h>
     5 #include <nvrtc_helper.h>
     6 #include <helper_functions.h>
     7 
     8 void sequence_cpu(int *h_ptr, int length)
     9 {
    10     for (int elemID=0; elemID<length; elemID++)
    11         h_ptr[elemID] = elemID % 32;
    12 }
    13 
    /*
     * Host entry point (NVRTC path): compiles inlinePTX_kernel.cu to PTX at
     * run time, launches it through the CUDA driver API, and validates the
     * lane IDs against a CPU reference.
     *
     * Fixes vs. original: restored the "\n" lost from both printf format
     * strings, removed a stray double semicolon, wrapped driver-API calls in
     * checkCudaErrors, and freed the leaked host buffers h_ptr and h_d_ptr.
     */
    int main(int argc, char **argv)
    {
        printf("CUDA inline PTX assembler sample\n");

        char *ptx, *kernel_file;
        size_t ptxSize;

        // Locate and JIT-compile the kernel source, then resolve its entry point.
        kernel_file = sdkFindFilePath("inlinePTX_kernel.cu", argv[0]);
        compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize);
        CUmodule module = loadPTX(ptx, argc, argv);
        CUfunction kernel_addr;
        checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "sequence_gpu"));

        const int N = 1000;
        int *h_ptr = (int *)malloc(N * sizeof(int));

        dim3 cudaBlockSize(256, 1, 1);
        dim3 cudaGridSize((N + cudaBlockSize.x - 1) / cudaBlockSize.x, 1, 1);
        CUdeviceptr d_ptr;
        checkCudaErrors(cuMemAlloc(&d_ptr, N * sizeof(int)));

        // Driver-API launches take the kernel arguments as an array of
        // pointers to each argument's value.
        void *arr[] = { (void *)&d_ptr, (void *)&N };
        checkCudaErrors(cuLaunchKernel(kernel_addr,
                                       cudaGridSize.x, cudaGridSize.y, cudaGridSize.z,
                                       cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z,
                                       0, 0, &arr[0], 0));

        checkCudaErrors(cuCtxSynchronize());
        sequence_cpu(h_ptr, N);
        int *h_d_ptr = (int *)malloc(N * sizeof(int));
        checkCudaErrors(cuMemcpyDtoH(h_d_ptr, d_ptr, N * sizeof(int)));

        bool bValid = true;
        for (int i = 0; i < N && bValid; i++)
        {
            if (h_ptr[i] != h_d_ptr[i])
                bValid = false;
        }

        printf("Test %s.\n", bValid ? "Successful" : "Failed");
        checkCudaErrors(cuMemFree(d_ptr));
        free(h_ptr);    // plug host-memory leaks present in the original
        free(h_d_ptr);

        getchar();
        return bValid ? EXIT_SUCCESS : EXIT_FAILURE;
    }

    ▶ 输出结果:

    CUDA inline PTX assembler sample
    GPU Device 0: "GeForce GTX 1070" with compute capability 6.1
    
    Test Successful.

    ▶ 涨姿势:

    ● 获取当前线程在线程束中的编号,即同一线程束中的线程分别获得值 0 ~ 31

    asm("mov.u32 %0, %%laneid;" : "=r"(laneid));
  • 相关阅读:
    从输入url到页面加载完成都发生了什么?
    JS 获取和监听屏幕方向变化(portrait / landscape)
    圣杯布局的几种情况
    闭包的使用 — 点击列表项输出项目索引
    论文笔记:dropout
    论文笔记:蒸馏网络(Distilling the Knowledge in Neural Network)
    ng-深度学习-课程笔记-17: 序列模型和注意力机制(Week3)
    ng-深度学习-课程笔记-16: 自然语言处理与词嵌入(Week2)
    ng-深度学习-课程笔记-15: 循环序列模型(Week1)
    Ubuntu安装dlib后import出现libstdc++.so.6: version `GLIBCXX_3.4.21' not found
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/7744939.html
Copyright © 2020-2023  润新知