CUDA stream test — asynchronous chunked copy/compute on a single non-default stream.
/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */

#include <stdio.h>
#include <stdlib.h>

#include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N (1024*1024)                 // elements per chunk
#define FULL_DATA_SIZE (N*20)         // total elements streamed through the GPU

// For each element idx < N, averages a[idx] with its two successors wrapped
// modulo 256, does the same for b, and stores the mean of the two averages
// (truncated to int) in c[idx].
// Expects a 1-D launch supplying at least N total threads; surplus threads
// exit through the bounds guard.
__global__ void kernel(int *a, int *b, int *c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        // the two indices after idx, wrapped modulo 256
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;       // float result truncated on store to int
    }
}

// Streams FULL_DATA_SIZE ints through the GPU in N-sized chunks on a single
// non-default stream: pinned host buffers + cudaMemcpyAsync let the H2D copy,
// kernel, and D2H copy of successive chunks overlap. The whole pipeline is
// timed with CUDA events. Returns 0 on success (errors abort via HANDLE_ERROR).
int main(void) {
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR(cudaGetDevice(&whichDevice));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    if (!prop.deviceOverlap) {
        printf("Device will not handle overlaps, so no speed up from streams ");
        return 0;
    }

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaStream_t stream;
    int *host_a, *host_b, *host_c;
    int *dev_a, *dev_b, *dev_c;

    // start the timers
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // initialize the stream
    HANDLE_ERROR(cudaStreamCreate(&stream));

    // allocate device memory for a single chunk; the same buffers are
    // reused for every iteration of the chunk loop below
    HANDLE_ERROR(cudaMalloc((void**)&dev_a, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c, N * sizeof(int)));

    // allocate page-locked (pinned) host memory for the full data set;
    // pinned memory is required for cudaMemcpyAsync to be truly asynchronous
    HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));

    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    HANDLE_ERROR(cudaEventRecord(start, 0));
    // now loop over full data, in bite-sized chunks
    for (int i = 0; i < FULL_DATA_SIZE; i += N) {
        // enqueue the host-to-device copies of this chunk asynchronously
        HANDLE_ERROR(cudaMemcpyAsync(dev_a, host_a + i,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b, host_b + i,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream));

        kernel<<<N / 256, 256, 0, stream>>>(dev_a, dev_b, dev_c);
        // kernel launches do not return an error directly; check for
        // launch-configuration failures explicitly
        HANDLE_ERROR(cudaGetLastError());

        // enqueue the device-to-host copy of the results for this chunk
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c,
                                     N * sizeof(int),
                                     cudaMemcpyDeviceToHost,
                                     stream));
    }
    // wait for all queued copies and kernels on the stream to finish
    HANDLE_ERROR(cudaStreamSynchronize(stream));

    HANDLE_ERROR(cudaEventRecord(stop, 0));

    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
                                      start, stop));
    printf("Time taken: %3.1f ms ", elapsedTime);

    // cleanup the streams, events, and memory
    HANDLE_ERROR(cudaFreeHost(host_a));
    HANDLE_ERROR(cudaFreeHost(host_b));
    HANDLE_ERROR(cudaFreeHost(host_c));
    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_c));
    HANDLE_ERROR(cudaStreamDestroy(stream));
    // events were leaked in the original; destroy them too
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    return 0;
}
Project source available as a packaged download.