介绍了线程束表决函数的实例(其概念介绍见 http://www.cnblogs.com/cuancuancuanhao/p/7841512.html),并在静态和运行时编译两种条件下进行使用。
▶ 源代码:静态
1 // simpleVote_kernel.cuh 2 #ifndef SIMPLEVOTE_KERNEL_CU 3 #define SIMPLEVOTE_KERNEL_CU 4 5 __global__ void voteAny(unsigned int *input, unsigned int *result)// 任意一个线程抛出非零值则函数返回非零值 6 { 7 int tx = threadIdx.x; 8 int mask = 0xffffffff; 9 result[tx] = __any_sync(mask, input[tx]); 10 } 11 12 __global__ void voteAll(unsigned int *input, unsigned int *result)// 当且仅当所有线程抛出非零值函数才返回非零值 13 { 14 int tx = threadIdx.x; 15 int mask = 0xffffffff; 16 result[tx] = __all_sync(mask, input[tx]); 17 } 18 19 __global__ void vote3(bool *info, int warp_size)// 跨线程束检查 20 { 21 int tx = threadIdx.x; 22 unsigned int mask = 0xffffffff; 23 bool *offs = info + (tx * 3);// 将每个线程指向等距间隔的元素,表明表决函数的运算结果可以进行分发 24 25 // 第一组 “下标模 3 得 0” 的元素为 0,第二组和第三组 “下标模 3 得 0” 的元素为 1。“一组” 为 warp_size * 3 个元素 26 *offs = __any_sync(mask, tx >= warp_size * 3 / 2); 27 // 第一组和第二组前半段 “下标模 3 得 1” 的元素为 0,第二组后半段和第三组 “下标模 3 得 1” 的元素为 1 28 *(offs + 1) = (tx >= warp_size * 3 / 2)? true: false; 29 // 第一组和第二组 “下标模 3 得 2” 的元素为 0,第三组 “下标模 3 得 2” 的元素为 1 30 *(offs + 2) = all(tx >= warp_size * 3 / 2) ? true : false; 31 // 最终结果应该是: 32 // 1 2 3 4 15 16 17 18 30 31 32 33 // 000 000 000 000 ... 000 000 000 000 ... 000 000 000 34 // 100 100 100 100 ... 100 100 110 110 ... 110 110 110 35 // 111 111 111 111 ... 111 111 111 111 ... 111 111 111 36 } 37 #endif
1 // simpleVoteIntrinsics.cu 2 #include <stdio.h> 3 #include <cuda_runtime.h> 4 #include "device_launch_parameters.h" 5 #include <helper_functions.h> 6 #include <helper_cuda.h> 7 #include "simpleVote_kernel.cuh" 8 9 #define WARP_SIZE 32 10 #define GROUP 4 11 12 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)// 构建原数组,size == 8 时结果为{0,0,0,3,4,0,ffffffff,ffffffff} 13 { 14 for (int i = 0; i < size / 4; i++) 15 VOTE_PATTERN[i] = 0x00000000; 16 17 for (int i = size / 4; i < size / 2; i++) 18 VOTE_PATTERN[i] = (i & 0x01) ? i : 0; 19 20 for (int i = size / 2; i < 3 * size / 4; i++) 21 VOTE_PATTERN[i] = (i & 0x01) ? 0 : i; 22 23 for (int i = 3 * size / 4; i < size; i++) 24 VOTE_PATTERN[i] = 0xffffffff; 25 } 26 // 数组检查函数,type == 1:把数组元素全部加起来,结果非零就报错;type == 0:把数组元素全部加起来,结果不等于 WARP_SIZE 就报错 27 int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name) 28 { 29 int i, sum; 30 for (sum = 0, i = start; i < end; i++) 31 sum += h_result[i]; 32 if (type&&sum > 0 || !type&& sum != WARP_SIZE) 33 { 34 printf(" <%s>[%d - %d]:", name, start, end-1); 35 for (i = start; i < end; i++) 36 printf("%d,", h_result[i]); 37 printf(" "); 38 } 39 return type?(sum > 0):(sum != WARP_SIZE); 40 } 41 42 // 数组检查的中间函数,type == 1:使用(1,0,0,0)的模式调用数组检查函数;type == 0:使用(1,1,1,0)的模式调用数组检查函数 43 int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type) 44 { 45 int error_count = 0; 46 47 error_count += checkErrors(h_result, 0 * totalElement / 4, 1 * totalElement / 4, type?1:1,"Vote.Any"); 48 error_count += checkErrors(h_result, 1 * totalElement / 4, 2 * totalElement / 4, type?0:1,"Vote.Any"); 49 error_count += checkErrors(h_result, 2 * totalElement / 4, 3 * totalElement / 4, type?0:1,"Vote.Any"); 50 error_count += checkErrors(h_result, 3 * totalElement / 4, 4 * totalElement / 4, type?0:0,"Vote.Any"); 51 52 printf("%s ", !error_count ? "Passed" : "Failed"); 53 return error_count; 54 } 55 int checkResultsVoteKernel(bool *hinfo, int totalThread) 56 { 57 int i, error_count; 58 for (i = error_count = 0; i < totalThread * 3; i++) 59 { 60 switch (i % 3) 61 { 62 case 0: 63 if (hinfo[i] != (i >= totalThread * 1)) // 等价于 if (i < totalThread && hinfo[i] == 0 || i >= totalThread && hinfo == 1) 64 error_count++; 65 break; 66 case 1: 67 if (hinfo[i] != (i >= totalThread * 3 / 2)) // 等价于 if (i < totalThread * 3 / 2 && hinfo[i] == 0 || i >= totalThread * 3 / 2 && hinfo == 1) 68 error_count++; 69 break; 70 case 2: 71 if (hinfo[i] != (i >= totalThread * 2)) // 等价于 if (i < totalThread * 2 && hinfo[i] == 0 || i >= totalThread * 2 && hinfo == 1) 72 error_count++; 73 break; 74 } 75 } 76 printf("%s ", !error_count ? "Passed" : "Failed"); 77 return error_count; 78 } 79 80 int main() 81 { 82 printf(" Start. "); 83 int totalElement; 84 unsigned int *h_input, *h_result; 85 unsigned int *d_input, *d_result; 86 bool *dinfo = NULL, *hinfo = NULL; 87 int error_count[3] = { 0, 0, 0 }; 88 cudaSetDevice(0); 89 90 //使用长度为 4 个线程束的数组,刚好分为 4 个组(全零,后交替非零,前交替非零,全非零)进行表决 91 totalElement = WARP_SIZE * GROUP; 92 h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int)); 93 h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int)); 94 cudaMalloc((void **)&d_input, totalElement * sizeof(unsigned int)); 95 cudaMalloc((void **)&d_result, totalElement * sizeof(unsigned int)); 96 genVoteTestPattern(h_input, totalElement); 97 cudaMemcpy(d_input, h_input, totalElement * sizeof(unsigned int), cudaMemcpyHostToDevice); 98 99 //测试一,any 100 printf(" Test 1: "); 101 voteAny << <dim3(1, 1), dim3(totalElement, 1) >> > (d_input, d_result); 102 cudaDeviceSynchronize(); 103 cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost); 104 error_count[0] += checkResultsVoteKernel(h_result, totalElement, 1); 105 106 // 测试二,all 107 printf(" Test 2: "); 108 voteAll << <dim3(1, 1), dim3(totalElement, 1) >> > (d_input, d_result); 109 cudaDeviceSynchronize(); 110 cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost); 111 error_count[1] += checkResultsVoteKernel(h_result, totalElement, 0); 112 113 // 测试三,使用长度为 9 个线程束的数组,但调用内核时只使用数量为 3 个线程束的线程,即分为 3 组,每组 WARP_SIZE * 3 个元素 114 printf(" Test 3: "); 115 totalElement = WARP_SIZE * 3 * 3; 116 hinfo = (bool *)calloc(totalElement, sizeof(bool)); 117 cudaMalloc((void **)&dinfo, totalElement * sizeof(bool)); 118 cudaMemcpy(dinfo, hinfo, totalElement * sizeof(bool), cudaMemcpyHostToDevice); 119 vote3 << <1, totalElement / 3 >> > (dinfo, WARP_SIZE); 120 cudaDeviceSynchronize(); 121 cudaMemcpy(hinfo, dinfo, totalElement * sizeof(bool), cudaMemcpyDeviceToHost); 122 error_count[2] = checkResultsVoteKernel(hinfo, totalElement / 3); 123 124 // 清理工作 125 cudaFree(d_input); 126 cudaFree(d_result); 127 free(h_input); 128 free(h_result); 129 free(hinfo); 130 cudaFree(dinfo); 131 printf(" Finish. "); 132 getchar(); 133 return (error_count[0] || error_count[1] || error_count[2]) ? EXIT_FAILURE : EXIT_SUCCESS; 134 }
▶ 输出结果:
Start. Test 1: Passed Test 2: Passed Test 3: Passed Finish.
▶ 源代码:运行时编译(删掉了相同的注释)
1 // simpleVote_kernel.cuh 2 #ifndef SIMPLEVOTE_KERNEL_CU 3 #define SIMPLEVOTE_KERNEL_CU 4 5 extern "C" __global__ void voteAny(unsigned int *input, unsigned int *result) 6 { 7 int tx = threadIdx.x; 8 int mask = 0xffffffff; 9 result[tx] = __any_sync(mask, input[tx]); 10 } 11 12 extern "C" __global__ void voteAll(unsigned int *input, unsigned int *result) 13 { 14 int tx = threadIdx.x; 15 int mask = 0xffffffff; 16 result[tx] = __all_sync(mask, input[tx]); 17 } 18 19 extern "C" __global__ void vote3(bool *info, int warp_size) 20 { 21 int tx = threadIdx.x; 22 unsigned int mask = 0xffffffff; 23 bool *offs = info + (tx * 3); 24 *offs = __any_sync(mask, tx >= warp_size * 3 / 2); 25 *(offs + 1) = (tx >= warp_size * 3 / 2) ? true : false; 26 *(offs + 2) = all(tx >= warp_size * 3 / 2) ? true : false; 27 } 28 #endif
1 // simpleVoteIntrinsics.cu 2 #include <stdio.h> 3 #include <cuda_runtime.h> 4 #include "device_launch_parameters.h" 5 #include "nvrtc_helper.h" 6 #include <helper_functions.h> 7 8 #define WARP_SIZE 32 9 #define GROUP 4 10 11 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size) 12 { 13 for (int i = 0; i < size / 4; i++) 14 VOTE_PATTERN[i] = 0x00000000; 15 16 for (int i = size / 4; i < size / 2; i++) 17 VOTE_PATTERN[i] = (i & 0x01) ? i : 0; 18 19 for (int i = size / 2; i < 3 * size / 4; i++) 20 VOTE_PATTERN[i] = (i & 0x01) ? 0 : i; 21 22 for (int i = 3 * size / 4; i < size; i++) 23 VOTE_PATTERN[i] = 0xffffffff; 24 } 25 26 int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name) 27 { 28 int i, sum; 29 for (sum = 0, i = start; i < end; i++) 30 sum += h_result[i]; 31 if (type&&sum > 0 || !type&& sum != WARP_SIZE) 32 { 33 printf(" <%s>[%d - %d]:", name, start, end - 1); 34 for (i = start; i < end; i++) 35 printf("%d,", h_result[i]); 36 printf(" "); 37 } 38 return type ? (sum > 0) : (sum != WARP_SIZE); 39 } 40 41 int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type) 42 { 43 int error_count = 0; 44 45 error_count += checkErrors(h_result, 0 * totalElement / 4, 1 * totalElement / 4, type ? 1 : 1, "Vote.Any"); 46 error_count += checkErrors(h_result, 1 * totalElement / 4, 2 * totalElement / 4, type ? 0 : 1, "Vote.Any"); 47 error_count += checkErrors(h_result, 2 * totalElement / 4, 3 * totalElement / 4, type ? 0 : 1, "Vote.Any"); 48 error_count += checkErrors(h_result, 3 * totalElement / 4, 4 * totalElement / 4, type ? 0 : 0, "Vote.Any"); 49 50 printf("%s ", !error_count ? "Passed" : "Failed"); 51 return error_count; 52 } 53 int checkResultsVoteKernel(bool *hinfo, int totalThread) 54 { 55 int i, error_count; 56 for (i = error_count = 0; i < totalThread * 3; i++) 57 { 58 switch (i % 3) 59 { 60 case 0: 61 if (hinfo[i] != (i >= totalThread * 1)) 62 error_count++; 63 break; 64 case 1: 65 if (hinfo[i] != (i >= totalThread * 3 / 2)) 66 error_count++; 67 break; 68 case 2: 69 if (hinfo[i] != (i >= totalThread * 2)) 70 error_count++; 71 break; 72 } 73 } 74 printf("%s ", !error_count ? "Passed" : "Failed"); 75 return error_count; 76 } 77 78 int main() 79 { 80 printf(" Start. "); 81 int totalElement; 82 unsigned int *h_input, *h_result; 83 CUdeviceptr d_input, d_result;// unsigned long long 84 bool *hinfo = NULL; 85 CUdeviceptr dinfo; 86 int error_count[3] = { 0, 0, 0 }; 87 //cudaSetDevice(0); 88 89 // 编译 PTX 90 char *ptx, *kernel_file; 91 size_t ptxSize; 92 kernel_file = "D:\Program\CUDA9.0\Samples\0_Simple\simpleVoteIntrinsics_nvrtc\simpleVote_kernel.cuh"; 93 compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0);// (1, NULL) 为主函数接受的参数个数和参数 94 CUmodule module = loadPTX(ptx, 1, NULL); 95 96 totalElement = WARP_SIZE * GROUP; 97 h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int)); 98 h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int)); 99 cuMemAlloc(&d_input, totalElement * sizeof(unsigned int)); 100 cuMemAlloc(&d_result, totalElement * sizeof(unsigned int)); 101 genVoteTestPattern(h_input, totalElement); 102 cuMemcpyHtoD(d_input, h_input, totalElement * sizeof(unsigned int)); 103 104 //测试一,any 105 printf(" Test 1: "); 106 dim3 gridBlock(1, 1); 107 dim3 threadBlock(totalElement, 1); 108 CUfunction kernel_addr; 109 cuModuleGetFunction(&kernel_addr, module, "voteAny"); 110 void *arr1[] = { (void *)&d_input, (void *)&d_result }; 111 cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr1[0], 0); 112 cuCtxSynchronize(); 113 cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int)); 114 error_count[0] += checkResultsVoteKernel(h_result, totalElement, 1); 115 116 // 测试二,all 117 printf(" Test 2: "); 118 cuModuleGetFunction(&kernel_addr, module, "voteAll"); 119 cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr1[0], 0); 120 cuCtxSynchronize(); 121 cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int)); 122 error_count[1] += checkResultsVoteKernel(h_result, totalElement, 0); 123 124 // 测试三 125 printf(" Test 3: "); 126 totalElement = WARP_SIZE * 3 * 3; 127 hinfo = (bool *)calloc(totalElement, sizeof(bool)); 128 cuMemAlloc(&dinfo, totalElement * sizeof(bool)); 129 cuMemcpyHtoD(dinfo, hinfo, totalElement * sizeof(bool)); 130 threadBlock = dim3(totalElement / 3, 1); // 改变线程块尺寸 131 cuModuleGetFunction(&kernel_addr, module, "vote3"); 132 int size = WARP_SIZE; 133 void *arr2[] = { (void *)&dinfo, (void *)&size }; 134 cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr2[0], 0); 135 cuCtxSynchronize(); 136 cuMemcpyDtoH(hinfo, dinfo, totalElement * sizeof(bool)); 137 error_count[2] = checkResultsVoteKernel(hinfo, totalElement / 3); 138 139 // 清理工作 140 cuMemFree(d_input); 141 cuMemFree(d_result); 142 free(h_input); 143 free(h_result); 144 free(hinfo); 145 cuMemFree(dinfo); 146 printf(" Finish. "); 147 getchar(); 148 return (error_count[0] || error_count[1] || error_count[2]) ? EXIT_FAILURE : EXIT_SUCCESS; 149 }
▶ 输出结果:
Start. > Using CUDA Device [0]: GeForce GTX 1070 > GPU Device has SM 6.1 compute capability Test 1: Passed Test 2: Passed Test 3: Passed Finish.
▶ 涨姿势
● 线程表决函数见另一篇博客,注意 CUDA9.0 改进了部分函数,废弃了旧的部分函数。