不合理的代码
1 /* 2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 3 * 4 * NVIDIA Corporation and its licensors retain all intellectual property and 5 * proprietary rights in and to this software and related documentation. 6 * Any use, reproduction, disclosure, or distribution of this software 7 * and related documentation without an express license agreement from 8 * NVIDIA Corporation is strictly prohibited. 9 * 10 * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 * associated with this source code for terms and conditions that govern 12 * your use of this NVIDIA software. 13 * 14 */ 15 16 17 #include "../common/book.h" 18 #include "cuda.h" 19 #include "cuda_runtime.h" 20 #include "device_launch_parameters.h" 21 #define N (1024*1024) 22 #define FULL_DATA_SIZE (N*20) 23 24 25 __global__ void kernel(int *a, int *b, int *c) { 26 int idx = threadIdx.x + blockIdx.x * blockDim.x; 27 if (idx < N) { 28 int idx1 = (idx + 1) % 256; 29 int idx2 = (idx + 2) % 256; 30 float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f; 31 float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f; 32 c[idx] = (as + bs) / 2; 33 } 34 } 35 36 37 int main(void) { 38 cudaDeviceProp prop; 39 int whichDevice; 40 HANDLE_ERROR(cudaGetDevice(&whichDevice)); 41 HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice)); 42 if (!prop.deviceOverlap) { 43 printf("Device will not handle overlaps, so no speed up from streams "); 44 return 0; 45 } 46 47 cudaEvent_t start, stop; 48 float elapsedTime; 49 50 cudaStream_t stream0, stream1; 51 int *host_a, *host_b, *host_c; 52 int *dev_a0, *dev_b0, *dev_c0; 53 int *dev_a1, *dev_b1, *dev_c1; 54 55 // start the timers 56 HANDLE_ERROR(cudaEventCreate(&start)); 57 HANDLE_ERROR(cudaEventCreate(&stop)); 58 59 // initialize the streams 60 HANDLE_ERROR(cudaStreamCreate(&stream0)); 61 HANDLE_ERROR(cudaStreamCreate(&stream1)); 62 63 // allocate the memory on the GPU 64 HANDLE_ERROR(cudaMalloc((void**)&dev_a0, 65 N * sizeof(int))); 66 HANDLE_ERROR(cudaMalloc((void**)&dev_b0, 67 N * sizeof(int))); 68 HANDLE_ERROR(cudaMalloc((void**)&dev_c0, 69 N * sizeof(int))); 70 HANDLE_ERROR(cudaMalloc((void**)&dev_a1, 71 N * sizeof(int))); 72 HANDLE_ERROR(cudaMalloc((void**)&dev_b1, 73 N * sizeof(int))); 74 HANDLE_ERROR(cudaMalloc((void**)&dev_c1, 75 N * sizeof(int))); 76 77 // allocate host locked memory, used to stream 78 HANDLE_ERROR(cudaHostAlloc((void**)&host_a, 79 FULL_DATA_SIZE * sizeof(int), 80 cudaHostAllocDefault)); 81 HANDLE_ERROR(cudaHostAlloc((void**)&host_b, 82 FULL_DATA_SIZE * sizeof(int), 83 cudaHostAllocDefault)); 84 HANDLE_ERROR(cudaHostAlloc((void**)&host_c, 85 FULL_DATA_SIZE * sizeof(int), 86 cudaHostAllocDefault)); 87 88 for (int i = 0; i<FULL_DATA_SIZE; i++) { 89 host_a[i] = rand(); 90 host_b[i] = rand(); 91 } 92 93 HANDLE_ERROR(cudaEventRecord(start, 0)); 94 // now loop over full data, in bite-sized chunks 95 for (int i = 0; i<FULL_DATA_SIZE; i += N * 2) { 96 // copy the locked memory to the device, async 97 HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i, 98 N * sizeof(int), 99 cudaMemcpyHostToDevice, 100 stream0)); 101 HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i, 102 N * sizeof(int), 103 cudaMemcpyHostToDevice, 104 stream0)); 105 106 kernel << <N / 256, 256, 0, stream0 >> >(dev_a0, dev_b0, dev_c0); 107 108 // copy the data from device to locked memory 109 HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0, 110 N * sizeof(int), 111 cudaMemcpyDeviceToHost, 112 stream0)); 113 114 115 // copy the locked memory to the device, async 116 HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N, 117 N * sizeof(int), 118 cudaMemcpyHostToDevice, 119 stream1)); 120 HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N, 121 N * sizeof(int), 122 cudaMemcpyHostToDevice, 123 stream1)); 124 125 kernel << <N / 256, 256, 0, stream1 >> >(dev_a1, dev_b1, dev_c1); 126 127 // copy the data from device to locked memory 128 HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1, 129 N * sizeof(int), 130 cudaMemcpyDeviceToHost, 131 stream1)); 132 } 133 HANDLE_ERROR(cudaStreamSynchronize(stream0)); 134 HANDLE_ERROR(cudaStreamSynchronize(stream1)); 135 136 HANDLE_ERROR(cudaEventRecord(stop, 0)); 137 138 HANDLE_ERROR(cudaEventSynchronize(stop)); 139 HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, 140 start, stop)); 141 printf("Time taken: %3.1f ms ", elapsedTime); 142 143 // cleanup the streams and memory 144 HANDLE_ERROR(cudaFreeHost(host_a)); 145 HANDLE_ERROR(cudaFreeHost(host_b)); 146 HANDLE_ERROR(cudaFreeHost(host_c)); 147 HANDLE_ERROR(cudaFree(dev_a0)); 148 HANDLE_ERROR(cudaFree(dev_b0)); 149 HANDLE_ERROR(cudaFree(dev_c0)); 150 HANDLE_ERROR(cudaFree(dev_a1)); 151 HANDLE_ERROR(cudaFree(dev_b1)); 152 HANDLE_ERROR(cudaFree(dev_c1)); 153 HANDLE_ERROR(cudaStreamDestroy(stream0)); 154 HANDLE_ERROR(cudaStreamDestroy(stream1)); 155 156 return 0; 157 }
代码下载