• CUDA实例练习(十):多个cuda流


      1 #include <stdio.h>
      2 #include <cuda_runtime.h>
      3 #include <device_launch_parameters.h>
      4 #include "book.h"
      5 
      6 #define N (1024*1024)
      7 #define FULL_DATA_SIZE (N * 20)
      8 
      // Element-wise averaging kernel.
      //
      // For each global index idx < N, averages three samples of a (a[idx] and two
      // samples taken from the first 256 elements at (idx+1)%256 and (idx+2)%256),
      // does the same for b, and stores the mean of the two averages in c[idx]
      // (converted to int).
      //
      // Launch layout: 1D grid of 1D blocks, one thread per element; threads whose
      // index is >= N do nothing. a, b and c must each hold at least N ints.
      __global__ void kernel(int *a, int *b, int *c){
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          if (idx < N){
              int idx1 = (idx + 1) % 256;
              int idx2 = (idx + 2) % 256;
              // Sum in 64-bit: three ints can exceed INT_MAX (the host fills the
              // inputs with rand()), and signed int overflow is undefined.
              long long sum_a = (long long)a[idx] + a[idx1] + a[idx2];
              long long sum_b = (long long)b[idx] + b[idx1] + b[idx2];
              float as = (float)sum_a / 3.0f;
              float bs = (float)sum_b / 3.0f;
              // The float result is deliberately narrowed to int for storage.
              c[idx] = (int)((as + bs) / 2.0f);
          }
      }
     19 
     // Two-stream pipeline, depth-first enqueue order.
     //
     // Processes FULL_DATA_SIZE ints in chunks of N, alternating chunks between
     // stream0 and stream1. Within each loop iteration ALL of stream0's work
     // (copy a, copy b, kernel, copy c back) is enqueued before any of stream1's
     // work. Prints the elapsed time measured with CUDA events and returns 0.
     //
     // HANDLE_ERROR comes from book.h (not shown here); every runtime call and
     // every kernel launch is passed through it.
     int main(void){
         cudaDeviceProp prop;
         int whichDevice;
         HANDLE_ERROR(cudaGetDevice(&whichDevice));
         HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
         // NOTE(review): deviceOverlap is reported as deprecated in newer toolkits
         // in favour of asyncEngineCount -- confirm against the toolkit in use.
         if (!prop.deviceOverlap){
             printf("Device will not handle overlaps, so no speed up from streams\n");
             return 0;
         }
         cudaEvent_t start, stop;
         float elapsedTime;

         // Start the timer. The start event is recorded before the streams and
         // buffers are created, so the reported time includes that setup.
         HANDLE_ERROR(cudaEventCreate(&start));
         HANDLE_ERROR(cudaEventCreate(&stop));
         HANDLE_ERROR(cudaEventRecord(start, 0));

         // Create the two streams.
         cudaStream_t stream0, stream1;
         HANDLE_ERROR(cudaStreamCreate(&stream0));
         HANDLE_ERROR(cudaStreamCreate(&stream1));

         int *host_a, *host_b, *host_c;
         int *dev_a0, *dev_b0, *dev_c0;  // device buffers used by stream0
         int *dev_a1, *dev_b1, *dev_c1;  // device buffers used by stream1

         // Allocate one chunk (N ints) per buffer on the device.
         HANDLE_ERROR(cudaMalloc((void **)&dev_a0, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void **)&dev_b0, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void **)&dev_c0, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void **)&dev_a1, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void **)&dev_b1, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void **)&dev_c1, N * sizeof(int)));

         // Allocate page-locked host memory for the whole data set; it is the
         // source/destination of every cudaMemcpyAsync below.
         HANDLE_ERROR(cudaHostAlloc((void **)&host_a, FULL_DATA_SIZE * sizeof(int),
             cudaHostAllocDefault));
         HANDLE_ERROR(cudaHostAlloc((void **)&host_b, FULL_DATA_SIZE * sizeof(int),
             cudaHostAllocDefault));
         HANDLE_ERROR(cudaHostAlloc((void **)&host_c, FULL_DATA_SIZE * sizeof(int),
             cudaHostAllocDefault));

         for (int i = 0; i < FULL_DATA_SIZE; i++){
             host_a[i] = rand();
             host_b[i] = rand();
         }

         // Walk the full data set two chunks per iteration: the chunk at offset i
         // goes to stream0, the chunk at offset i + N goes to stream1.
         for (int i = 0; i < FULL_DATA_SIZE; i += N * 2){
             // stream0: asynchronously copy the inputs to the device
             HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i, N * sizeof(int),
                 cudaMemcpyHostToDevice, stream0));
             HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i, N * sizeof(int),
                 cudaMemcpyHostToDevice, stream0));
             kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
             // A launch returns no status itself; fetch launch errors explicitly.
             HANDLE_ERROR(cudaGetLastError());

             // stream0: copy the result back into page-locked host memory
             HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0, N * sizeof(int),
                 cudaMemcpyDeviceToHost, stream0));

             // stream1: asynchronously copy the inputs to the device
             HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N, N * sizeof(int),
                 cudaMemcpyHostToDevice, stream1));
             HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N, N * sizeof(int),
                 cudaMemcpyHostToDevice, stream1));
             kernel<<<N / 256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
             HANDLE_ERROR(cudaGetLastError());

             // stream1: copy the result back into page-locked host memory
             HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1, N * sizeof(int),
                 cudaMemcpyDeviceToHost, stream1));
         }

         // Synchronize both streams before stopping the timer.
         HANDLE_ERROR(cudaStreamSynchronize(stream0));
         HANDLE_ERROR(cudaStreamSynchronize(stream1));
         HANDLE_ERROR(cudaEventRecord(stop, 0));
         HANDLE_ERROR(cudaEventSynchronize(stop));
         HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
         printf("Time taken: %3.1f ms\n", elapsedTime);

         // Release events, memory and streams.
         HANDLE_ERROR(cudaEventDestroy(start));
         HANDLE_ERROR(cudaEventDestroy(stop));
         HANDLE_ERROR(cudaFreeHost(host_a));
         HANDLE_ERROR(cudaFreeHost(host_b));
         HANDLE_ERROR(cudaFreeHost(host_c));
         HANDLE_ERROR(cudaFree(dev_a0));
         HANDLE_ERROR(cudaFree(dev_b0));
         HANDLE_ERROR(cudaFree(dev_c0));
         HANDLE_ERROR(cudaFree(dev_a1));
         HANDLE_ERROR(cudaFree(dev_b1));
         HANDLE_ERROR(cudaFree(dev_c1));
         HANDLE_ERROR(cudaStreamDestroy(stream0));
         HANDLE_ERROR(cudaStreamDestroy(stream1));

         return 0;
     }

    如果同时调度某个流的所有操作,那么很容易在无意中阻塞另一个流的复制操作或者核函数执行。要解决这个问题,在将操作放入流的队列时应采用宽度优先方式,而非深度优先方式。

      1 #include <stdio.h>
      2 #include <cuda_runtime.h>
      3 #include <device_launch_parameters.h>
      4 #include "book.h"
      5 
      6 #define N (1024*1024)
      7 #define FULL_DATA_SIZE (N * 20)
      8 
      // Element-wise averaging kernel (same computation as in the depth-first
      // listing).
      //
      // For each global index idx < N: takes a[idx] plus two samples from the
      // first 256 elements of a (at (idx+1)%256 and (idx+2)%256), averages them,
      // does the same for b, and writes the mean of both averages to c[idx] as an
      // int.
      //
      // Launch layout: 1D grid of 1D blocks, one thread per element; threads with
      // an index >= N return without touching memory. a, b and c must each hold at
      // least N ints.
      __global__ void kernel(int *a, int *b, int *c){
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          if (idx < N){
              int idx1 = (idx + 1) % 256;
              int idx2 = (idx + 2) % 256;
              // Accumulate in 64-bit so that three large ints (the host fills the
              // inputs with rand()) cannot overflow a signed 32-bit sum.
              long long sum_a = (long long)a[idx] + a[idx1] + a[idx2];
              long long sum_b = (long long)b[idx] + b[idx1] + b[idx2];
              float as = (float)sum_a / 3.0f;
              float bs = (float)sum_b / 3.0f;
              // Explicit narrowing of the float mean to the int output type.
              c[idx] = (int)((as + bs) / 2.0f);
          }
      }
     19 
     // Two-stream pipeline, breadth-first enqueue order.
     //
     // Processes FULL_DATA_SIZE ints in chunks of N, alternating chunks between
     // stream0 and stream1. Within each loop iteration the operations are
     // interleaved across the streams: copy a (stream0, stream1), copy b
     // (stream0, stream1), kernel (stream0, stream1), copy c back (stream0,
     // stream1). Prints the elapsed time measured with CUDA events and returns 0.
     //
     // HANDLE_ERROR comes from book.h (not shown here); every runtime call and
     // every kernel launch is passed through it.
     int main(void){
         cudaDeviceProp prop;
         int whichDevice;
         HANDLE_ERROR(cudaGetDevice(&whichDevice));
         HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
         // NOTE(review): deviceOverlap is reported as deprecated in newer toolkits
         // in favour of asyncEngineCount -- confirm against the toolkit in use.
         if (!prop.deviceOverlap){
             printf("Device will not handle overlaps, so no speed up from streams\n");
             return 0;
         }
         cudaEvent_t start, stop;
         float elapsedTime;

         // Start the timer. The start event is recorded before the streams and
         // buffers are created, so the reported time includes that setup.
         HANDLE_ERROR(cudaEventCreate(&start));
         HANDLE_ERROR(cudaEventCreate(&stop));
         HANDLE_ERROR(cudaEventRecord(start, 0));

         // Create the two streams.
         cudaStream_t stream0, stream1;
         HANDLE_ERROR(cudaStreamCreate(&stream0));
         HANDLE_ERROR(cudaStreamCreate(&stream1));

         int *host_a, *host_b, *host_c;
         int *dev_a0, *dev_b0, *dev_c0;  // device buffers used by stream0
         int *dev_a1, *dev_b1, *dev_c1;  // device buffers used by stream1

         // Allocate one chunk (N ints) per buffer on the device.
         HANDLE_ERROR(cudaMalloc((void **)&dev_a0, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void **)&dev_b0, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void **)&dev_c0, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void **)&dev_a1, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void **)&dev_b1, N * sizeof(int)));
         HANDLE_ERROR(cudaMalloc((void **)&dev_c1, N * sizeof(int)));

         // Allocate page-locked host memory for the whole data set; it is the
         // source/destination of every cudaMemcpyAsync below.
         HANDLE_ERROR(cudaHostAlloc((void **)&host_a, FULL_DATA_SIZE * sizeof(int),
             cudaHostAllocDefault));
         HANDLE_ERROR(cudaHostAlloc((void **)&host_b, FULL_DATA_SIZE * sizeof(int),
             cudaHostAllocDefault));
         HANDLE_ERROR(cudaHostAlloc((void **)&host_c, FULL_DATA_SIZE * sizeof(int),
             cudaHostAllocDefault));

         for (int i = 0; i < FULL_DATA_SIZE; i++){
             host_a[i] = rand();
             host_b[i] = rand();
         }

         // Walk the full data set two chunks per iteration: the chunk at offset i
         // goes to stream0, the chunk at offset i + N goes to stream1.
         for (int i = 0; i < FULL_DATA_SIZE; i += N * 2) {
             // enqueue copies of a in stream0 and stream1
             HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i,
                 N * sizeof(int),
                 cudaMemcpyHostToDevice,
                 stream0));
             HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,
                 N * sizeof(int),
                 cudaMemcpyHostToDevice,
                 stream1));
             // enqueue copies of b in stream0 and stream1
             HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,
                 N * sizeof(int),
                 cudaMemcpyHostToDevice,
                 stream0));
             HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,
                 N * sizeof(int),
                 cudaMemcpyHostToDevice,
                 stream1));

             // enqueue kernels in stream0 and stream1; a launch returns no status
             // itself, so launch errors are fetched explicitly after each one
             kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
             HANDLE_ERROR(cudaGetLastError());
             kernel<<<N / 256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
             HANDLE_ERROR(cudaGetLastError());

             // enqueue copies of c from device to locked memory
             HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,
                 N * sizeof(int),
                 cudaMemcpyDeviceToHost,
                 stream0));
             HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,
                 N * sizeof(int),
                 cudaMemcpyDeviceToHost,
                 stream1));
         }

         // Synchronize both streams before stopping the timer.
         HANDLE_ERROR(cudaStreamSynchronize(stream0));
         HANDLE_ERROR(cudaStreamSynchronize(stream1));
         HANDLE_ERROR(cudaEventRecord(stop, 0));
         HANDLE_ERROR(cudaEventSynchronize(stop));
         HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
         printf("Time taken: %3.1f ms\n", elapsedTime);

         // Release events, memory and streams.
         HANDLE_ERROR(cudaEventDestroy(start));
         HANDLE_ERROR(cudaEventDestroy(stop));
         HANDLE_ERROR(cudaFreeHost(host_a));
         HANDLE_ERROR(cudaFreeHost(host_b));
         HANDLE_ERROR(cudaFreeHost(host_c));
         HANDLE_ERROR(cudaFree(dev_a0));
         HANDLE_ERROR(cudaFree(dev_b0));
         HANDLE_ERROR(cudaFree(dev_c0));
         HANDLE_ERROR(cudaFree(dev_a1));
         HANDLE_ERROR(cudaFree(dev_b1));
         HANDLE_ERROR(cudaFree(dev_c1));
         HANDLE_ERROR(cudaStreamDestroy(stream0));
         HANDLE_ERROR(cudaStreamDestroy(stream1));

         return 0;
     }

  • 相关阅读:
    tasker支持的shell 命令大全
    crx 文件安装 如何安装 Chrome插件
    python mac地址计算
    Java线程池
    springMVC请求过程
    java中特殊的String类型
    单例模式和多例模式
    hash算法学习
    arraylist和linkedlist的简单比较
    乐观锁和悲观锁
  • 原文地址:https://www.cnblogs.com/zhangshuwen/p/7348226.html
Copyright © 2020-2023  润新知