• cuda流测试=basic_single_stream


    cuda流测试

      1 /*
      2 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
      3 *
      4 * NVIDIA Corporation and its licensors retain all intellectual property and
      5 * proprietary rights in and to this software and related documentation.
      6 * Any use, reproduction, disclosure, or distribution of this software
      7 * and related documentation without an express license agreement from
      8 * NVIDIA Corporation is strictly prohibited.
      9 *
     10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
     11 * associated with this source code for terms and conditions that govern
     12 * your use of this NVIDIA software.
     13 *
     14 */
     15 
     16 
     17 #include "../common/book.h"
     18 #include "cuda.h"
     19 #include "cuda_runtime.h"
     20 #include "device_launch_parameters.h"
     21 #define N   (1024*1024)
     22 #define FULL_DATA_SIZE   (N*20)
     23 
     24 
     /*
      * Per-element averaging kernel for one chunk of N ints.
      *
      * For each global thread index idx < N the kernel reads three elements
      * of `a` and three elements of `b` -- element idx plus the elements at
      * (idx + 1) % 256 and (idx + 2) % 256 -- averages each triple, and stores
      * the mean of the two averages (converted to int) in c[idx].
      *
      * The two extra indices are reduced modulo 256, so they always fall in
      * [0, 255] regardless of idx; they are not idx's immediate neighbours
      * once idx >= 254.
      *
      * Launch layout: 1D grid of 1D blocks. Threads with idx >= N do nothing,
      * so the grid may be larger than N.
      *
      * a, b: input arrays, read at indices < N and at indices in [0, 255].
      * c:    output array, written at indices < N.
      */
     __global__ void kernel(int *a, int *b, int *c) {
         int idx = threadIdx.x + blockIdx.x * blockDim.x;
         if (idx < N) {
             // The two additional sample positions (wrapped into [0, 255]).
             int idx1 = (idx + 1) % 256;
             int idx2 = (idx + 2) % 256;

             // Accumulate in 64 bits: three large 32-bit ints can overflow an
             // int sum. For sums that fit in int this yields exactly the same
             // float as the plain int addition would.
             long long sumA = (long long)a[idx] + a[idx1] + a[idx2];
             long long sumB = (long long)b[idx] + b[idx1] + b[idx2];

             float   as = sumA / 3.0f;
             float   bs = sumB / 3.0f;
             c[idx] = (as + bs) / 2;
         }
     }
     36 
     37 
     /*
      * Single-stream demo: processes FULL_DATA_SIZE ints in chunks of N.
      *
      * For every chunk the host enqueues, on one CUDA stream, an async copy of
      * the `a` and `b` inputs to the device, a launch of `kernel`, and an async
      * copy of the result back to the host. The whole loop is timed with a pair
      * of CUDA events and the elapsed time is printed in milliseconds.
      *
      * Returns 0 on completion, and also 0 (after printing a notice) when the
      * device reports no copy/compute overlap support. API failures are routed
      * through HANDLE_ERROR, which is defined in ../common/book.h (not shown
      * here).
      */
     int main(void) {
         cudaDeviceProp  prop;
         int whichDevice;
         HANDLE_ERROR(cudaGetDevice(&whichDevice));
         HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
         if (!prop.deviceOverlap) {
             printf("Device will not handle overlaps, so no speed up from streams\n");
             return 0;
         }

         cudaEvent_t     start, stop;
         float           elapsedTime;

         cudaStream_t    stream;
         int *host_a, *host_b, *host_c;
         int *dev_a, *dev_b, *dev_c;

         // create the timing events
         HANDLE_ERROR(cudaEventCreate(&start));
         HANDLE_ERROR(cudaEventCreate(&stop));

         // create the stream all copies and launches are queued on
         HANDLE_ERROR(cudaStreamCreate(&stream));

         // allocate one chunk (N ints) per buffer on the GPU
         HANDLE_ERROR(cudaMalloc((void**)&dev_a,
             N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void**)&dev_b,
             N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void**)&dev_c,
             N * sizeof(int)));

         // allocate page-locked (pinned) host buffers for the full data set;
         // these are the host side of the cudaMemcpyAsync calls below
         HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
             FULL_DATA_SIZE * sizeof(int),
             cudaHostAllocDefault));
         HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
             FULL_DATA_SIZE * sizeof(int),
             cudaHostAllocDefault));
         HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
             FULL_DATA_SIZE * sizeof(int),
             cudaHostAllocDefault));

         // fill both inputs with pseudo-random values
         for (int i = 0; i < FULL_DATA_SIZE; i++) {
             host_a[i] = rand();
             host_b[i] = rand();
         }

         HANDLE_ERROR(cudaEventRecord(start, 0));
         // now loop over full data, in bite-sized chunks of N ints
         for (int i = 0; i < FULL_DATA_SIZE; i += N) {
             // asynchronously copy this chunk of the inputs to the device
             HANDLE_ERROR(cudaMemcpyAsync(dev_a, host_a + i,
                 N * sizeof(int),
                 cudaMemcpyHostToDevice,
                 stream));
             HANDLE_ERROR(cudaMemcpyAsync(dev_b, host_b + i,
                 N * sizeof(int),
                 cudaMemcpyHostToDevice,
                 stream));

             kernel<<<N / 256, 256, 0, stream>>>(dev_a, dev_b, dev_c);
             // a launch returns no status itself; pick up launch errors here
             HANDLE_ERROR(cudaGetLastError());

             // asynchronously copy this chunk of the results back to the host
             HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c,
                 N * sizeof(int),
                 cudaMemcpyDeviceToHost,
                 stream));
         }
         // block until every copy and kernel queued on the stream has finished
         HANDLE_ERROR(cudaStreamSynchronize(stream));

         HANDLE_ERROR(cudaEventRecord(stop, 0));

         HANDLE_ERROR(cudaEventSynchronize(stop));
         HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
             start, stop));
         printf("Time taken:  %3.1f ms\n", elapsedTime);

         // cleanup the events, stream and memory
         HANDLE_ERROR(cudaFreeHost(host_a));
         HANDLE_ERROR(cudaFreeHost(host_b));
         HANDLE_ERROR(cudaFreeHost(host_c));
         HANDLE_ERROR(cudaFree(dev_a));
         HANDLE_ERROR(cudaFree(dev_b));
         HANDLE_ERROR(cudaFree(dev_c));
         HANDLE_ERROR(cudaStreamDestroy(stream));
         HANDLE_ERROR(cudaEventDestroy(start));
         HANDLE_ERROR(cudaEventDestroy(stop));

         return 0;
     }

    项目打包下载

  • 相关阅读:
    linux 笔试题
    shell -Z- d等等代表
    shell中for循环总结
    linux启动过程
    linux面试题3
    linux面试题2
    小峰servlet/jsp(4)EL表达式
    小峰servlet/jsp(3)登陆功能实现
    小峰servlet/jsp(2)
    java日期比较例子等...
  • 原文地址:https://www.cnblogs.com/liangliangdetianxia/p/3996333.html
Copyright © 2020-2023  润新知