• basic_double_stream_incorrect


    不合理的代码(在每次循环中,先把 stream0 的拷贝、核函数、回拷全部入队,然后才入队 stream1 的同样操作)

      1 /*
      2 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
      3 *
      4 * NVIDIA Corporation and its licensors retain all intellectual property and
      5 * proprietary rights in and to this software and related documentation.
      6 * Any use, reproduction, disclosure, or distribution of this software
      7 * and related documentation without an express license agreement from
      8 * NVIDIA Corporation is strictly prohibited.
      9 *
     10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
     11 * associated with this source code for terms and conditions that govern
     12 * your use of this NVIDIA software.
     13 *
     14 */
     15 
     16 
     17 #include "../common/book.h"
     18 #include "cuda.h"
     19 #include "cuda_runtime.h"
     20 #include "device_launch_parameters.h"
     21 #define N   (1024*1024)
     22 #define FULL_DATA_SIZE   (N*20)
     23 
     24 
     /*
      * Element-wise averaging kernel used by the stream demo.
      *
      * For each global thread index idx < N it averages three values taken from
      * a (a[idx], a[(idx + 1) % 256], a[(idx + 2) % 256]), does the same for b,
      * and stores the mean of the two averages, converted to int, in c[idx].
      *
      * Launch layout: 1-D grid of 1-D blocks; threads whose global index is
      * >= N do nothing. a, b and c must each hold at least N ints.
      */
     __global__ void kernel(int *a, int *b, int *c) {
         int idx = threadIdx.x + blockIdx.x * blockDim.x;
         if (idx < N) {
             // Neighbour indices wrap into [0, 255], not into [0, N).
             int idx1 = (idx + 1) % 256;
             int idx2 = (idx + 2) % 256;

             // Accumulate in 64 bits: the sum of three ints can exceed INT_MAX
             // (the host fills the inputs with rand()), and signed int overflow
             // is undefined. The result is unchanged whenever the int sum would
             // not have overflowed.
             long long a_sum = static_cast<long long>(a[idx]) + a[idx1] + a[idx2];
             long long b_sum = static_cast<long long>(b[idx]) + b[idx1] + b[idx2];

             float   as = static_cast<float>(a_sum) / 3.0f;
             float   bs = static_cast<float>(b_sum) / 3.0f;

             // Implicit float -> int conversion on the store, as in the original.
             c[idx] = (as + bs) / 2;
         }
     }
     35 
     36 
     /*
      * Times a two-stream copy/compute pipeline over FULL_DATA_SIZE ints.
      *
      * The input is processed in pairs of N-element chunks: the first chunk of
      * each pair goes through stream0 and the second through stream1, each as
      * host-to-device copies of a and b, a kernel launch, and a device-to-host
      * copy of c. The elapsed time between the start and stop events is printed
      * in milliseconds.
      *
      * Returns 0 both on completion and when the device reports no
      * copy/compute overlap support (in which case nothing is run).
      */
     int main(void) {
         cudaDeviceProp  prop;
         int whichDevice;
         HANDLE_ERROR(cudaGetDevice(&whichDevice));
         HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
         if (!prop.deviceOverlap) {
             printf("Device will not handle overlaps, so no speed up from streams\n");
             return 0;
         }

         cudaEvent_t     start, stop;
         float           elapsedTime;

         cudaStream_t    stream0, stream1;
         int *host_a, *host_b, *host_c;
         int *dev_a0, *dev_b0, *dev_c0;   // device buffers used by stream0
         int *dev_a1, *dev_b1, *dev_c1;   // device buffers used by stream1

         // create the timing events
         HANDLE_ERROR(cudaEventCreate(&start));
         HANDLE_ERROR(cudaEventCreate(&stop));

         // initialize the streams
         HANDLE_ERROR(cudaStreamCreate(&stream0));
         HANDLE_ERROR(cudaStreamCreate(&stream1));

         // allocate one N-element chunk per buffer on the GPU
         HANDLE_ERROR(cudaMalloc((void**)&dev_a0, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void**)&dev_b0, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void**)&dev_c0, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void**)&dev_a1, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void**)&dev_b1, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void**)&dev_c1, N * sizeof(int)));

         // allocate the full-size host buffers with cudaHostAlloc; they are the
         // source and destination of the cudaMemcpyAsync calls below
         HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
             FULL_DATA_SIZE * sizeof(int),
             cudaHostAllocDefault));
         HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
             FULL_DATA_SIZE * sizeof(int),
             cudaHostAllocDefault));
         HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
             FULL_DATA_SIZE * sizeof(int),
             cudaHostAllocDefault));

         for (int i = 0; i < FULL_DATA_SIZE; i++) {
             host_a[i] = rand();
             host_b[i] = rand();
         }

         HANDLE_ERROR(cudaEventRecord(start, 0));

         // Loop over the full data two chunks at a time. In every iteration all
         // of stream0's operations are enqueued before any of stream1's.
         // NOTE(review): this issue order may limit copy/compute overlap on some
         // devices; interleaving the two streams' enqueues is the usual remedy.
         // It is left as-is because this listing appears to exist to demonstrate
         // exactly this ordering -- confirm before changing.
         for (int i = 0; i < FULL_DATA_SIZE; i += N * 2) {
             // stream0: inputs for chunk [i, i + N)
             HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i,
                 N * sizeof(int),
                 cudaMemcpyHostToDevice,
                 stream0));
             HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,
                 N * sizeof(int),
                 cudaMemcpyHostToDevice,
                 stream0));

             kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
             // A launch returns no status itself; pick up launch errors here.
             HANDLE_ERROR(cudaGetLastError());

             // stream0: results for chunk [i, i + N)
             HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,
                 N * sizeof(int),
                 cudaMemcpyDeviceToHost,
                 stream0));

             // stream1: inputs for chunk [i + N, i + 2N)
             HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,
                 N * sizeof(int),
                 cudaMemcpyHostToDevice,
                 stream1));
             HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,
                 N * sizeof(int),
                 cudaMemcpyHostToDevice,
                 stream1));

             kernel<<<N / 256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
             HANDLE_ERROR(cudaGetLastError());

             // stream1: results for chunk [i + N, i + 2N)
             HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,
                 N * sizeof(int),
                 cudaMemcpyDeviceToHost,
                 stream1));
         }

         // wait for both streams to drain before stopping the timer
         HANDLE_ERROR(cudaStreamSynchronize(stream0));
         HANDLE_ERROR(cudaStreamSynchronize(stream1));

         HANDLE_ERROR(cudaEventRecord(stop, 0));

         HANDLE_ERROR(cudaEventSynchronize(stop));
         HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
             start, stop));
         printf("Time taken:  %3.1f ms\n", elapsedTime);

         // cleanup the memory, streams and events
         HANDLE_ERROR(cudaFreeHost(host_a));
         HANDLE_ERROR(cudaFreeHost(host_b));
         HANDLE_ERROR(cudaFreeHost(host_c));
         HANDLE_ERROR(cudaFree(dev_a0));
         HANDLE_ERROR(cudaFree(dev_b0));
         HANDLE_ERROR(cudaFree(dev_c0));
         HANDLE_ERROR(cudaFree(dev_a1));
         HANDLE_ERROR(cudaFree(dev_b1));
         HANDLE_ERROR(cudaFree(dev_c1));
         HANDLE_ERROR(cudaStreamDestroy(stream0));
         HANDLE_ERROR(cudaStreamDestroy(stream1));
         HANDLE_ERROR(cudaEventDestroy(start));
         HANDLE_ERROR(cudaEventDestroy(stop));

         return 0;
     }

    代码下载

  • 相关阅读:
    严格模式
    es6模块与 commonJS规范的区别
    Javascript内置对象、原生对象、宿主对象关系
    实现继承的几种方式
    创建对象的一些方式
    null的小扩展
    getElementById的缩略
    你真的知道为什么不推荐使用@import?
    换行与不换行
    transition与animation
  • 原文地址:https://www.cnblogs.com/liangliangdetianxia/p/3996364.html
Copyright © 2020-2023  润新知