• 0_Simple__simpleMultiCopy


    Uses CUDA's overlap capability to run kernel computation and data copies concurrently for a speedup; the core idiom is sketched just below.
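
    Reduced to a minimum, the idiom looks like the following sketch. This is an illustrative simplification, not the sample itself: the names (inc, din, dout, s) are invented here, and it assumes a device with at least one async copy engine. The essential ingredients are pinned host memory (cudaHostAlloc), cudaMemcpyAsync, and putting independent cycles on different streams:

        #include <cuda_runtime.h>

        #define N     (1 << 20)
        #define BYTES (N * sizeof(int))

        __global__ void inc(int *out, const int *in, int n)
        {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i < n) out[i] = in[i] + 1;
        }

        int main()
        {
            int *h[2], *din[2], *dout[2];
            cudaStream_t s[2];
            for (int i = 0; i < 2; ++i)
            {
                cudaHostAlloc(&h[i], BYTES, cudaHostAllocDefault); // pinned host memory: required for truly async copies
                cudaMalloc(&din[i], BYTES);
                cudaMalloc(&dout[i], BYTES);
                cudaStreamCreate(&s[i]);
            }
            for (int rep = 0; rep < 8; ++rep)
            {
                int cur = rep % 2;
                // The three operations of one cycle go on the same stream, so they run
                // in order; the other stream's cycle is independent and can overlap.
                cudaMemcpyAsync(din[cur], h[cur], BYTES, cudaMemcpyHostToDevice, s[cur]);
                inc<<<(N + 511) / 512, 512, 0, s[cur]>>>(dout[cur], din[cur], N);
                cudaMemcpyAsync(h[cur], dout[cur], BYTES, cudaMemcpyDeviceToHost, s[cur]);
            }
            cudaDeviceSynchronize(); // wait for both streams to drain
            for (int i = 0; i < 2; ++i)
            {
                cudaFreeHost(h[i]); cudaFree(din[i]); cudaFree(dout[i]); cudaStreamDestroy(s[i]);
            }
            return 0;
        }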

    ▶ Source code. Using 4 streams, the "upload – kernel – download" cycle is executed 10 times in total, and the elapsed time is recorded.

        #include <stdio.h>
        #include <cuda_runtime.h>
        #include "device_launch_parameters.h"
        #include <helper_cuda.h>
        #include <helper_functions.h>

        #define STREAM_COUNT 4
        #define NREPS        10
        #define INNER_REPS   5

        int N = 1 << 22;
        int memsize;

        int *h_data_source;
        int *h_data_sink;
        int *h_data_in[STREAM_COUNT];
        int *h_data_out[STREAM_COUNT];
        int *d_data_in[STREAM_COUNT];
        int *d_data_out[STREAM_COUNT];

        dim3 grid;
        dim3 block(512);
        cudaEvent_t cycleDone[STREAM_COUNT], start, stop;
        cudaStream_t stream[STREAM_COUNT];

        __global__ void incKernel(int *g_out, int *g_in, int size)
        {
            int idx = blockIdx.x * blockDim.x + threadIdx.x;

            if (idx < size)
            {
                for (int i = 0; i < INNER_REPS; ++i) // brute-force repeat 5 times to add work; will the compiler optimize this away?
                    g_out[idx] = g_in[idx] + 1;
            }
        }
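        // Regarding the question above: g_out is plain global memory (not volatile),
        // so a compiler is in principle allowed to collapse the INNER_REPS identical
        // stores into one; whether nvcc actually does is version-dependent. Making the
        // stored value depend on i, or declaring the pointer volatile, would guarantee
        // the extra work survives.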

        float processWithStreams(int streams_used)
        {
            cudaEventRecord(start, 0);
            for (int i = 0, current_stream = 0; i < NREPS; ++i)
            {
                int next_stream = (current_stream + 1) % streams_used;

        #ifdef SIMULATE_IO
                // Consume the data downloaded in the previous cycle
                memcpy(h_data_sink, h_data_out[current_stream], memsize);

                // Refresh the data to be uploaded
                memcpy(h_data_in[next_stream], h_data_source, memsize);
        #endif

                // Make sure the work queued on stream next_stream in the previous cycle
                // has finished; this throttles the host loop and, with SIMULATE_IO,
                // protects the host buffers that are about to be rewritten
                cudaEventSynchronize(cycleDone[next_stream]);

                // Launch the kernel on the current stream
                incKernel<<<grid, block, 0, stream[current_stream]>>>(d_data_out[current_stream], d_data_in[current_stream], N);

                // Start the upload for the next stream
                cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream], memsize, cudaMemcpyHostToDevice, stream[next_stream]);

                // Start the download on the current stream
                cudaMemcpyAsync(h_data_out[current_stream], d_data_out[current_stream], memsize, cudaMemcpyDeviceToHost, stream[current_stream]);

                cudaEventRecord(cycleDone[current_stream], stream[current_stream]);

                current_stream = next_stream;
            }
            cudaEventRecord(stop, 0);
            cudaDeviceSynchronize();

            float time;
            cudaEventElapsedTime(&time, start, stop);
            return time;
        }

        bool test()
        {
            bool passed = true;
            for (int j = 0; j < STREAM_COUNT; ++j)
            {
                for (int i = 0; i < N; ++i)
                    passed &= (h_data_out[j][i] == 1);
            }
            return passed;
        }

        int main(int argc, char *argv[])
        {
            printf("\n\tStart.\n");

            // Pick a device and query its capabilities
            int cuda_device = 0;
            cudaDeviceProp deviceProp;

            if (checkCmdLineFlag(argc, (const char **)argv, "device"))
            {
                if ((cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=")) < 0)
                {
                    printf("Invalid command line parameters\n");
                    exit(EXIT_FAILURE);
                }
                else
                {
                    printf("cuda_device = %d\n", cuda_device);
                    if ((cuda_device = gpuDeviceInit(cuda_device)) < 0)
                    {
                        printf("No CUDA Capable devices found, exiting...\n");
                        exit(EXIT_SUCCESS);
                    }
                }
            }
            else
            {
                cuda_device = gpuGetMaxGflopsDeviceId();
                cudaSetDevice(cuda_device);
                cudaGetDeviceProperties(&deviceProp, cuda_device);
                printf("\n\tDevice [%d]: %s, compute capability %d.%d, ", cuda_device, deviceProp.name, deviceProp.major, deviceProp.minor);
            }
            cudaGetDeviceProperties(&deviceProp, cuda_device);
            printf("%d MP(s) x %d (Cores/MP) = %d (Cores)\n",
                   deviceProp.multiProcessorCount,
                   _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), // cores per SM for this compute capability; defined in helper_cuda.h
                   _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

            printf("\n\tRelevant properties of this CUDA device.\n");
            printf("\t(%c) Execute several GPU kernels simultaneously\n", deviceProp.major >= 2 ? 'Y' : 'N');
            printf("\t(%c) Overlap one CPU<->GPU data transfer with GPU kernel execution\n", deviceProp.deviceOverlap ? 'Y' : 'N');
            printf("\t(%c) Overlap two CPU<->GPU data transfers with GPU kernel execution\n", (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? 'Y' : 'N');

            // Scale down the workload if the device has fewer than 32 CUDA cores
            float scale_factor = max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), 1.0f);
            N = (int)((float)N / scale_factor);
            printf("\n\tscale_factor = %.2f\n\tarray_size   = %d\n", 1.0f / scale_factor, N);

            // Set up the launch configuration
            memsize = N * sizeof(int);
            int thread_blocks = N / block.x;
            grid.x = thread_blocks % 65535;
            grid.y = (thread_blocks / 65535 + 1);

            h_data_source = (int *) malloc(memsize);
            h_data_sink = (int *) malloc(memsize);
            for (int i = 0; i < STREAM_COUNT; ++i)
            {
                cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault); // pinned host memory, required for async copies
                cudaMalloc(&d_data_in[i], memsize);
                cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault);
                cudaMalloc(&d_data_out[i], memsize);

                cudaStreamCreate(&stream[i]);
                cudaEventCreate(&cycleDone[i]);
                cudaEventRecord(cycleDone[i], stream[i]);
            }
            cudaEventCreate(&start);
            cudaEventCreate(&stop);

            // Initialize h_data_source and h_data_in
            for (int i = 0; i < N; ++i)
                h_data_source[i] = 0;
            for (int i = 0; i < STREAM_COUNT; ++i)
                memcpy(h_data_in[i], h_data_source, memsize);

            // Warm-up run
            incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N);

            // Time the individual pipeline stages
            cudaEventRecord(start, 0);
            cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0);
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);

            float memcpy_h2d_time;
            cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

            cudaEventRecord(start, 0);
            cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0);
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);

            float memcpy_d2h_time;
            cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

            cudaEventRecord(start, 0);
            incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N);
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);

            float kernel_time;
            cudaEventElapsedTime(&kernel_time, start, stop);

            printf("\n\tMeasured timings (throughput):\n");
            printf("\tMemcpy host to device:\t%f ms (%f GB/s)\n", memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
            printf("\tMemcpy device to host:\t%f ms (%f GB/s)\n", memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);
            printf("\tKernel:               \t%f ms (%f GB/s)\n", kernel_time, (INNER_REPS * memsize * 2e-6) / kernel_time);

            printf("\n\tTheoretical limits for speedup gained from overlapped data transfers:\n");
            printf("\tNo overlap (transfer-kernel-transfer):\t%f ms\n", memcpy_h2d_time + memcpy_d2h_time + kernel_time);
            printf("\tOverlap one transfer:                 \t%f ms\n", max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
            printf("\tOverlap both data transfers:          \t%f ms\n", max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));

            // Run the pipeline serialized (1 stream) and overlapped (STREAM_COUNT streams)
            float serial_time = processWithStreams(1);
            float overlap_time = processWithStreams(STREAM_COUNT);

            printf("\n\tAverage measured timings over %d repetitions:\n", NREPS);
            printf("\tAvg. time serialized:      \t%f ms (%f GB/s)\n", serial_time / NREPS, (NREPS * (memsize * 2e-6)) / serial_time);
            printf("\tAvg. time using %d streams:\t%f ms (%f GB/s)\n", STREAM_COUNT, overlap_time / NREPS, (NREPS * (memsize * 2e-6)) / overlap_time);
            printf("\tAvg. speedup gained:       \t%f ms\n", (serial_time - overlap_time) / NREPS);

            printf("\n\tResult test: %s.\n", test() ? "Passed" : "Failed");

            // Cleanup
            free(h_data_source);
            free(h_data_sink);
            for (int i = 0; i < STREAM_COUNT; ++i)
            {
                cudaFreeHost(h_data_in[i]);
                cudaFree(d_data_in[i]);
                cudaFreeHost(h_data_out[i]);
                cudaFree(d_data_out[i]);
                cudaStreamDestroy(stream[i]);
                cudaEventDestroy(cycleDone[i]);
            }
            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            getchar();
            return 0;
        }
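
    A build note: helper_cuda.h and helper_functions.h are not part of the CUDA toolkit proper but ship with the CUDA samples, so the listing needs the samples' common/inc directory on the include path. Something along these lines should compile it (the samples path below is a placeholder that varies by installation):

        nvcc -I/path/to/cuda-samples/common/inc simpleMultiCopy.cu -o simpleMultiCopy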

    ▶ Output

        Start.
    
        Device [0]: GeForce GTX 1070, compute capability 6.1, 16 MP(s) x 128 (Cores/MP) = 2048 (Cores)
    
        Relevant properties of this CUDA device.
        (Y) Execute several GPU kernels simultaneously
        (Y) Overlap one CPU<->GPU data transfer with GPU kernel execution
        (Y) Overlap two CPU<->GPU data transfers with GPU kernel execution
    
        scale_factor = 1.00
        array_size   = 4194304
    
        Measured timings (throughput):
        Memcpy host to device:  1.276192 ms (13.146311 GB/s)
        Memcpy device to host:  1.279008 ms (13.117366 GB/s)
        Kernel:                 1.312768 ms (127.800314 GB/s)
    
        Theoretical limits for speedup gained from overlapped data transfers:
        No overlap (transfer-kernel-transfer):  3.867968 ms
        Overlap one transfer:                   2.555200 ms
        Overlap both data transfers:            1.312768 ms
    
        Average measured timings over 10 repetitions:
        Avg. time serialized:           3.992167 ms (8.405068 GB/s)
        Avg. time using 4 streams:      1.896141 ms (17.696171 GB/s)
        Avg. speedup gained:            2.096026 ms
    
        Result test: Passed.
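
    Sanity check on the numbers: the "no overlap" limit is just the sum 1.276 + 1.279 + 1.313 ≈ 3.868 ms, and "overlap both" is the maximum of the three stage times, 1.313 ms. The measured 4-stream average of 1.896 ms lands between the two-copy-engine bound (1.313 ms) and the one-copy-engine bound (2.555 ms), while the serialized run (3.992 ms) sits close to the no-overlap limit, as expected.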

    ▶ Takeaways

    ● None
