• 0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc


    介绍了线程束表决函数的实例(其概念介绍见 http://www.cnblogs.com/cuancuancuanhao/p/7841512.html),并在静态和运行时编译两种条件下进行使用。

    ▶ 源代码:静态

     1 // simpleVote_kernel.cuh
     2 #ifndef SIMPLEVOTE_KERNEL_CU
     3 #define SIMPLEVOTE_KERNEL_CU
     4 
     5 __global__ void voteAny(unsigned int *input, unsigned int *result)// 任意一个线程抛出非零值则函数返回非零值
     6 {
     7     int tx = threadIdx.x;
     8     int mask = 0xffffffff;
     9     result[tx] = __any_sync(mask, input[tx]);
    10 }
    11 
    12 __global__ void voteAll(unsigned int *input, unsigned int *result)// 当且仅当所有线程抛出非零值函数才返回非零值
    13 {
    14     int tx = threadIdx.x;
    15     int mask = 0xffffffff;
    16     result[tx] = __all_sync(mask, input[tx]);
    17 }
    18 
    19 __global__ void vote3(bool *info, int warp_size)// 跨线程束检查
    20 {
    21     int tx = threadIdx.x;
    22     unsigned int mask = 0xffffffff;
    23     bool *offs = info + (tx * 3);// 将每个线程指向等距间隔的元素,表明表决函数的运算结果可以进行分发
    24 
    25     // 第一组 “下标模 3 得 0” 的元素为 0,第二组和第三组 “下标模 3 得 0” 的元素为 1。“一组” 为 warp_size * 3 个元素
    26     *offs = __any_sync(mask, tx >= warp_size * 3 / 2);    
    27     // 第一组和第二组前半段 “下标模 3 得 1” 的元素为 0,第二组后半段和第三组 “下标模 3 得 1” 的元素为 1    
    28     *(offs + 1) = (tx >= warp_size * 3 / 2)? true: false;
    29     // 第一组和第二组 “下标模 3 得 2” 的元素为 0,第三组 “下标模 3 得 2” 的元素为 1         
    30     *(offs + 2) = all(tx >= warp_size * 3 / 2) ? true : false;
    31     // 最终结果应该是:
    32     //   1   2   3   4      15  16  17  18      30  31  32
    33     // 000 000 000 000 ... 000 000 000 000 ... 000 000 000 
    34     // 100 100 100 100 ... 100 100 110 110 ... 110 110 110
    35     // 111 111 111 111 ... 111 111 111 111 ... 111 111 111
    36 }
    37 #endif
      1 // simpleVoteIntrinsics.cu
      2 #include <stdio.h>
      3 #include <cuda_runtime.h>
      4 #include "device_launch_parameters.h"
      5 #include <helper_functions.h>
      6 #include <helper_cuda.h>
      7 #include "simpleVote_kernel.cuh"
      8 
      9 #define WARP_SIZE   32
     10 #define GROUP       4
     11 
     12 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)// 构建原数组,size == 8 时结果为{0,0,0,3,4,0,ffffffff,ffffffff}
     13 {
     14     for (int i = 0; i < size / 4; i++)
     15         VOTE_PATTERN[i] = 0x00000000; 
     16 
     17     for (int i = size / 4; i < size / 2; i++)
     18         VOTE_PATTERN[i] = (i & 0x01) ? i : 0; 
     19 
     20     for (int i = size / 2; i < 3 * size / 4; i++)
     21         VOTE_PATTERN[i] = (i & 0x01) ? 0 : i; 
     22 
     23     for (int i = 3 * size / 4; i < size; i++)
     24         VOTE_PATTERN[i] = 0xffffffff; 
     25 }
     26 // 数组检查函数,type == 1:把数组元素全部加起来,结果非零就报错;type == 0:把数组元素全部加起来,结果不等于 WARP_SIZE 就报错
     27 int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
     28 {
     29     int i, sum;
     30     for (sum = 0, i = start; i < end; i++)
     31         sum += h_result[i];
     32     if (type&&sum > 0 || !type&& sum != WARP_SIZE)
     33     {
     34         printf("
    	<%s>[%d - %d]:", name, start, end-1);
     35         for (i = start; i < end; i++)
     36             printf("%d,", h_result[i]);
     37         printf("
    ");
     38     }
     39     return type?(sum > 0):(sum != WARP_SIZE);
     40 }
     41 
     42 // 数组检查的中间函数,type == 1:使用(1,0,0,0)的模式调用数组检查函数;type == 0:使用(1,1,1,0)的模式调用数组检查函数
     43 int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
     44 {
     45     int error_count = 0;
     46 
     47     error_count += checkErrors(h_result, 0 * totalElement / 4, 1 * totalElement / 4, type?1:1,"Vote.Any");
     48     error_count += checkErrors(h_result, 1 * totalElement / 4, 2 * totalElement / 4, type?0:1,"Vote.Any");
     49     error_count += checkErrors(h_result, 2 * totalElement / 4, 3 * totalElement / 4, type?0:1,"Vote.Any");
     50     error_count += checkErrors(h_result, 3 * totalElement / 4, 4 * totalElement / 4, type?0:0,"Vote.Any");
     51 
     52     printf("%s
    ", !error_count ? "Passed" : "Failed");
     53     return error_count;
     54 }
     55 int checkResultsVoteKernel(bool *hinfo, int totalThread)
     56 {
     57     int i, error_count;
     58     for (i = error_count = 0; i < totalThread * 3; i++)
     59     {
     60         switch (i % 3)
     61         {
     62             case 0:
     63                 if (hinfo[i] != (i >= totalThread * 1))     // 等价于 if (i < totalThread && hinfo[i] == 0 || i >= totalThread && hinfo == 1)
     64                     error_count++;
     65                 break;
     66             case 1:
     67                 if (hinfo[i] != (i >= totalThread * 3 / 2)) // 等价于 if (i < totalThread * 3 / 2 && hinfo[i] == 0 || i >= totalThread * 3 / 2 && hinfo == 1)
     68                     error_count++;
     69                 break;
     70             case 2:
     71                 if (hinfo[i] != (i >= totalThread * 2))     // 等价于 if (i < totalThread * 2 && hinfo[i] == 0 || i >= totalThread * 2 && hinfo == 1)
     72                     error_count++;
     73                 break;
     74         }
     75     }
     76     printf("%s
    ", !error_count ? "Passed" : "Failed");
     77     return error_count;
     78 }
     79 
     80 int main()
     81 {
     82     printf("
    	Start.
    ");
     83     int totalElement;
     84     unsigned int *h_input, *h_result;
     85     unsigned int *d_input, *d_result;
     86     bool *dinfo = NULL, *hinfo = NULL;
     87     int error_count[3] = { 0, 0, 0 };
     88     cudaSetDevice(0);
     89 
     90     //使用长度为 4 个线程束的数组,刚好分为 4 个组(全零,后交替非零,前交替非零,全非零)进行表决
     91     totalElement = WARP_SIZE * GROUP;
     92     h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int)); 
     93     h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
     94     cudaMalloc((void **)&d_input, totalElement * sizeof(unsigned int)); 
     95     cudaMalloc((void **)&d_result, totalElement * sizeof(unsigned int));
     96     genVoteTestPattern(h_input, totalElement); 
     97     cudaMemcpy(d_input, h_input, totalElement * sizeof(unsigned int), cudaMemcpyHostToDevice);
     98 
     99     //测试一,any
    100     printf("
    	Test 1: ");
    101     voteAny << <dim3(1, 1), dim3(totalElement, 1) >> > (d_input, d_result); 
    102     cudaDeviceSynchronize();
    103     cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);    
    104     error_count[0] += checkResultsVoteKernel(h_result, totalElement, 1);
    105 
    106     // 测试二,all
    107     printf("
    	Test 2: ");
    108     voteAll << <dim3(1, 1), dim3(totalElement, 1) >> > (d_input, d_result);
    109     cudaDeviceSynchronize();
    110     cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);
    111     error_count[1] += checkResultsVoteKernel(h_result, totalElement, 0);
    112 
    113     // 测试三,使用长度为 9 个线程束的数组,但调用内核时只使用数量为 3 个线程束的线程,即分为 3 组,每组 WARP_SIZE * 3 个元素
    114     printf("
    	Test 3: ");
    115     totalElement = WARP_SIZE * 3 * 3;
    116     hinfo = (bool *)calloc(totalElement, sizeof(bool));
    117     cudaMalloc((void **)&dinfo, totalElement * sizeof(bool));
    118     cudaMemcpy(dinfo, hinfo, totalElement * sizeof(bool), cudaMemcpyHostToDevice);
    119     vote3 << <1, totalElement / 3 >> > (dinfo, WARP_SIZE); 
    120     cudaDeviceSynchronize(); 
    121     cudaMemcpy(hinfo, dinfo, totalElement * sizeof(bool), cudaMemcpyDeviceToHost);
    122     error_count[2] = checkResultsVoteKernel(hinfo, totalElement / 3);
    123 
    124     // 清理工作
    125     cudaFree(d_input);
    126     cudaFree(d_result);
    127     free(h_input);
    128     free(h_result);
    129     free(hinfo);
    130     cudaFree(dinfo);
    131     printf("	
    Finish.
    ");
    132     getchar();
    133     return (error_count[0] || error_count[1] || error_count[2]) ? EXIT_FAILURE : EXIT_SUCCESS;
    134 }

    ▶ 输出结果:

        Start.
    
        Test 1: Passed
    
        Test 2: Passed
    
        Test 3: Passed
    
        Finish.

    ▶ 源代码:运行时编译(删掉了相同的注释)

     1 // simpleVote_kernel.cuh
     2 #ifndef SIMPLEVOTE_KERNEL_CU
     3 #define SIMPLEVOTE_KERNEL_CU
     4 
     5 extern "C" __global__ void voteAny(unsigned int *input, unsigned int *result)
     6 {
     7     int tx = threadIdx.x;
     8     int mask = 0xffffffff;
     9     result[tx] = __any_sync(mask, input[tx]);
    10 }
    11 
    12 extern "C" __global__ void voteAll(unsigned int *input, unsigned int *result)
    13 {
    14     int tx = threadIdx.x;
    15     int mask = 0xffffffff;
    16     result[tx] = __all_sync(mask, input[tx]);
    17 }
    18 
    19 extern "C" __global__ void vote3(bool *info, int warp_size)
    20 {
    21     int tx = threadIdx.x;
    22     unsigned int mask = 0xffffffff;
    23     bool *offs = info + (tx * 3);
    24     *offs = __any_sync(mask, tx >= warp_size * 3 / 2);    
    25     *(offs + 1) = (tx >= warp_size * 3 / 2) ? true : false;        
    26     *(offs + 2) = all(tx >= warp_size * 3 / 2) ? true : false;
    27 }
    28 #endif
      1 // simpleVoteIntrinsics.cu
      2 #include <stdio.h>
      3 #include <cuda_runtime.h>
      4 #include "device_launch_parameters.h"
      5 #include "nvrtc_helper.h"
      6 #include <helper_functions.h>
      7 
      8 #define WARP_SIZE   32
      9 #define GROUP       4
     10 
     11 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
     12 {
     13     for (int i = 0; i < size / 4; i++)
     14         VOTE_PATTERN[i] = 0x00000000;
     15 
     16     for (int i = size / 4; i < size / 2; i++)
     17         VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
     18 
     19     for (int i = size / 2; i < 3 * size / 4; i++)
     20         VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
     21 
     22     for (int i = 3 * size / 4; i < size; i++)
     23         VOTE_PATTERN[i] = 0xffffffff;
     24 }
     25 
     26 int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
     27 {
     28     int i, sum;
     29     for (sum = 0, i = start; i < end; i++)
     30         sum += h_result[i];
     31     if (type&&sum > 0 || !type&& sum != WARP_SIZE)
     32     {
     33         printf("
    	<%s>[%d - %d]:", name, start, end - 1);
     34         for (i = start; i < end; i++)
     35             printf("%d,", h_result[i]);
     36         printf("
    ");
     37     }
     38     return type ? (sum > 0) : (sum != WARP_SIZE);
     39 }
     40 
     41 int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
     42 {
     43     int error_count = 0;
     44 
     45     error_count += checkErrors(h_result, 0 * totalElement / 4, 1 * totalElement / 4, type ? 1 : 1, "Vote.Any");
     46     error_count += checkErrors(h_result, 1 * totalElement / 4, 2 * totalElement / 4, type ? 0 : 1, "Vote.Any");
     47     error_count += checkErrors(h_result, 2 * totalElement / 4, 3 * totalElement / 4, type ? 0 : 1, "Vote.Any");
     48     error_count += checkErrors(h_result, 3 * totalElement / 4, 4 * totalElement / 4, type ? 0 : 0, "Vote.Any");
     49 
     50     printf("%s
    ", !error_count ? "Passed" : "Failed");
     51     return error_count;
     52 }
     53 int checkResultsVoteKernel(bool *hinfo, int totalThread)
     54 {
     55     int i, error_count;
     56     for (i = error_count = 0; i < totalThread * 3; i++)
     57     {
     58         switch (i % 3)
     59         {
     60         case 0:
     61             if (hinfo[i] != (i >= totalThread * 1))
     62                 error_count++;
     63             break;
     64         case 1:
     65             if (hinfo[i] != (i >= totalThread * 3 / 2))
     66                 error_count++;
     67             break;
     68         case 2:
     69             if (hinfo[i] != (i >= totalThread * 2))
     70                 error_count++;
     71             break;
     72         }
     73     }
     74     printf("%s
    ", !error_count ? "Passed" : "Failed");
     75     return error_count;
     76 }
     77 
     78 int main()
     79 {
     80     printf("
    	Start.
    ");
     81     int totalElement;
     82     unsigned int *h_input, *h_result;
     83     CUdeviceptr d_input, d_result;// unsigned long long
     84     bool *hinfo = NULL;
     85     CUdeviceptr dinfo;
     86     int error_count[3] = { 0, 0, 0 };
     87     //cudaSetDevice(0); 
     88 
     89     // 编译 PTX
     90     char *ptx, *kernel_file;
     91     size_t ptxSize; 
     92     kernel_file = "D:\Program\CUDA9.0\Samples\0_Simple\simpleVoteIntrinsics_nvrtc\simpleVote_kernel.cuh";
     93     compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0);// (1, NULL) 为主函数接受的参数个数和参数
     94     CUmodule module = loadPTX(ptx, 1, NULL);
     95 
     96     totalElement = WARP_SIZE * GROUP;
     97     h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
     98     h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
     99     cuMemAlloc(&d_input, totalElement * sizeof(unsigned int));
    100     cuMemAlloc(&d_result, totalElement * sizeof(unsigned int));
    101     genVoteTestPattern(h_input, totalElement);
    102     cuMemcpyHtoD(d_input, h_input, totalElement * sizeof(unsigned int));
    103 
    104     //测试一,any
    105     printf("
    	Test 1: ");
    106     dim3 gridBlock(1, 1);
    107     dim3 threadBlock(totalElement, 1);
    108     CUfunction kernel_addr;
    109     cuModuleGetFunction(&kernel_addr, module, "voteAny");
    110     void *arr1[] = { (void *)&d_input, (void *)&d_result };
    111     cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr1[0], 0);
    112     cuCtxSynchronize();
    113     cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
    114     error_count[0] += checkResultsVoteKernel(h_result, totalElement, 1);
    115 
    116     // 测试二,all
    117     printf("
    	Test 2: ");
    118     cuModuleGetFunction(&kernel_addr, module, "voteAll");
    119     cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr1[0], 0);
    120     cuCtxSynchronize();
    121     cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
    122     error_count[1] += checkResultsVoteKernel(h_result, totalElement, 0);
    123 
    124     // 测试三
    125     printf("
    	Test 3: ");
    126     totalElement = WARP_SIZE * 3 * 3;
    127     hinfo = (bool *)calloc(totalElement, sizeof(bool));
    128     cuMemAlloc(&dinfo, totalElement * sizeof(bool));
    129     cuMemcpyHtoD(dinfo, hinfo, totalElement * sizeof(bool));
    130     threadBlock = dim3(totalElement / 3, 1);                    // 改变线程块尺寸
    131     cuModuleGetFunction(&kernel_addr, module, "vote3");
    132     int size = WARP_SIZE;
    133     void *arr2[] = { (void *)&dinfo, (void *)&size };
    134     cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr2[0], 0);
    135     cuCtxSynchronize();
    136     cuMemcpyDtoH(hinfo, dinfo, totalElement * sizeof(bool));
    137     error_count[2] = checkResultsVoteKernel(hinfo, totalElement / 3);
    138 
    139     // 清理工作
    140     cuMemFree(d_input);
    141     cuMemFree(d_result);
    142     free(h_input);
    143     free(h_result);
    144     free(hinfo);
    145     cuMemFree(dinfo);
    146     printf("	
    Finish.
    ");
    147     getchar();
    148     return (error_count[0] || error_count[1] || error_count[2]) ? EXIT_FAILURE : EXIT_SUCCESS;
    149 }

    ▶ 输出结果:

            Start.
    > Using CUDA Device [0]: GeForce GTX 1070
    > GPU Device has SM 6.1 compute capability
    
            Test 1: Passed
    
            Test 2: Passed
    
            Test 3: Passed
    
    Finish.

    ▶ 涨姿势

    ● 线程表决函数见另一篇博客,注意 CUDA9.0 改进了部分函数,废弃了旧的部分函数。

  • 相关阅读:
    docker(六):Dockerfile详解
    docker(五):存储卷管理
    安装VCenter 6.7
    VMware Workstation 不可恢复错误:(vmui) VERIFY bora
    HP DL360 G7安装esxi 6.0
    zabbix 4.0故障归纳
    docker(四):容器虚拟化网络概述
    安装VCenter 6.7时报错“系统未安装通用C”
    zookeeper查看日志
    zabbix(一):zabbix 4.0安装
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/8007759.html
Copyright © 2020-2023  润新知