在核函数代码中加入并行线程执行(Parallel Thread eXecution,PTX),通过汇编指令获取得有关线程束的信息。并且在静态代码和运行时编译两种条件下使用。
▶ 源代码:静态使用
1 #include <stdio.h> 2 #include <assert.h> 3 #include <cuda_runtime.h> 4 #include "device_launch_parameters.h" 5 #include <helper_functions.h> 6 #include <helper_cuda.h> 7 8 __global__ void sequence_gpu(int *d_ptr, int length) 9 { 10 int elemID = blockIdx.x * blockDim.x + threadIdx.x; 11 12 if (elemID < length) 13 { 14 unsigned int laneid; 15 asm("mov.u32 %0, %%laneid;" : "=r"(laneid));// 获取当前线程在线程束中的编号 16 d_ptr[elemID] = laneid; 17 } 18 } 19 20 void sequence_cpu(int *h_ptr, int length) 21 { 22 for (int elemID=0; elemID<length; elemID++) 23 h_ptr[elemID] = elemID % 32; 24 } 25 26 int main(int argc, char **argv) 27 { 28 printf("CUDA inline PTX assembler sample "); 29 30 const int N = 1000; 31 32 int dev = findCudaDevice(argc, (const char **) argv); 33 if (dev == -1) 34 return EXIT_FAILURE; 35 36 int *d_ptr; 37 cudaMalloc(&d_ptr, N * sizeof(int)); 38 int *h_ptr; 39 cudaMallocHost(&h_ptr, N * sizeof(int)); 40 41 dim3 cudaBlockSize(256,1,1); 42 dim3 cudaGridSize((N + cudaBlockSize.x - 1) / cudaBlockSize.x, 1, 1); 43 sequence_gpu<<<cudaGridSize, cudaBlockSize>>>(d_ptr, N); 44 cudaGetLastError(); 45 cudaDeviceSynchronize(); 46 47 sequence_cpu(h_ptr, N); 48 49 int *h_d_ptr; 50 cudaMallocHost(&h_d_ptr, N *sizeof(int)); 51 cudaMemcpy(h_d_ptr, d_ptr, N *sizeof(int), cudaMemcpyDeviceToHost); 52 53 bool bValid = true; 54 55 for (int i=0; i<N && bValid; i++) 56 { 57 if (h_ptr[i] != h_d_ptr[i]) 58 bValid = false; 59 } 60 61 printf("Test %s. ", bValid ? "Successful" : "Failed"); 62 63 cudaFree(d_ptr); 64 cudaFreeHost(h_ptr); 65 cudaFreeHost(h_d_ptr); 66 67 getchar(); 68 return bValid ? EXIT_SUCCESS: EXIT_FAILURE; 69 }
▶ 源代码:运行时编译
1 /*inlinePTX_kernel.cu*/ 2 extern "C" __global__ void sequence_gpu(int *d_ptr, int length) 3 { 4 int elemID = blockIdx.x * blockDim.x + threadIdx.x; 5 if (elemID < length) 6 { 7 unsigned int laneid; 8 asm("mov.u32 %0, %%laneid;" : "=r"(laneid)); 9 d_ptr[elemID] = laneid; 10 } 11 }
1 /*inlinePTX.cpp*/ 2 #include <stdio.h> 3 #include <assert.h> 4 #include <cuda_runtime.h> 5 #include <nvrtc_helper.h> 6 #include <helper_functions.h> 7 8 void sequence_cpu(int *h_ptr, int length) 9 { 10 for (int elemID=0; elemID<length; elemID++) 11 h_ptr[elemID] = elemID % 32; 12 } 13 14 int main(int argc, char **argv) 15 { 16 printf("CUDA inline PTX assembler sample "); 17 18 char *ptx, *kernel_file; 19 size_t ptxSize; 20 21 kernel_file = sdkFindFilePath("inlinePTX_kernel.cu", argv[0]); 22 compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize); 23 CUmodule module = loadPTX(ptx, argc, argv); 24 CUfunction kernel_addr; 25 cuModuleGetFunction(&kernel_addr, module, "sequence_gpu"); 26 27 const int N = 1000; 28 int *h_ptr = (int *)malloc(N * sizeof(int)); 29 30 dim3 cudaBlockSize(256,1,1); 31 dim3 cudaGridSize((N + cudaBlockSize.x - 1) / cudaBlockSize.x, 1, 1); 32 CUdeviceptr d_ptr; 33 cuMemAlloc(&d_ptr, N * sizeof(int)); 34 35 void *arr[] = { (void *)&d_ptr, (void *)&N }; 36 cuLaunchKernel(kernel_addr, 37 cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, 38 cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, 39 0, 0, &arr[0], 0); 40 41 cuCtxSynchronize(); 42 sequence_cpu(h_ptr, N); 43 int *h_d_ptr = (int *)malloc(N * sizeof(int));; 44 cuMemcpyDtoH(h_d_ptr, d_ptr, N *sizeof(int)); 45 46 bool bValid = true; 47 for (int i=0; i<N && bValid; i++) 48 { 49 if (h_ptr[i] != h_d_ptr[i]) 50 bValid = false; 51 } 52 53 printf("Test %s. ", bValid ? "Successful" : "Failed"); 54 cuMemFree(d_ptr); 55 56 getchar(); 57 return bValid ? EXIT_SUCCESS: EXIT_FAILURE; 58 }
▶ 输出结果:
CUDA inline PTX assembler sample GPU Device 0: "GeForce GTX 1070" with compute capability 6.1 Test Successful.
▶ 涨姿势:
● 获取当前线程在线程束中的编号,即同意先乘数中的线程分别获得值 0 ~ 31
asm("mov.u32 %0, %%laneid;" : "=r"(laneid));