这是一个cuda 自带的算例,包含cuda 计算的一般流程。
这个地址有比较清楚的cuda的介绍。感谢作者分享(http://blog.csdn.net/hjimce/article/details/51506207)
一般来说,cuda 计算的流程是:
1. 设置显卡编号:cudaSetDevice; 这个主要是在有多个GPU的机器上使用,其编号是从0号开始。
2. 为显卡开辟内存变量: cudaMalloc;使用方法:cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
这里的指针指向设备端的内存地址,无法在主机端直接解引用使用。
3.把主机端的数据拷贝到设备端:cudaMemcpy; 使用方法:
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
这里注意需要通过最后一个参数指明数据传输的方向(例如 cudaMemcpyHostToDevice 表示从主机端到设备端)。
4. 调用 __global__ 类型的内核函数:
cudaAdd<<<blocksPerGrid, threadsPerBlock>>>( );
这里blocksPerGrid, threadsPerBlock 都是dim3 类型的数据(一维情况下也可以直接传入整数)。
5. 把计算结果拷贝回主机端:cudaMemcpy,传输方向为 cudaMemcpyDeviceToHost。
6. 释放显存空间:cudaFree。
// Minimal CUDA sample: adds two small integer vectors on the GPU and then
// prints some properties of device 0.
//
// General flow: select a device, allocate device buffers, copy the inputs
// host -> device, launch the kernel, copy the result device -> host, and
// release the device buffers.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>  // exit, EXIT_FAILURE, system

// Reports a failed CUDA call and terminates the process.
//
// err:  status returned by a CUDA runtime call.
// file: source file of the call site (supplied by HANDLE_ERROR).
// line: source line of the call site (supplied by HANDLE_ERROR).
//
// Does nothing when err == cudaSuccess.
static void HandleError(cudaError_t err,
                        const char *file,
                        int line) {
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err),
               file, line);
        exit(EXIT_FAILURE);
    }
}
// Wraps a CUDA runtime call so that any failure aborts with file/line info.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
void printCudaInformation();

// Element-wise addition: c[i] = a[i] + b[i].
//
// Launch layout: the element index is threadIdx.x only, so the kernel must be
// launched with a single 1-D block containing exactly one thread per element.
// There is no bounds check; launching more threads than elements reads and
// writes out of range. No shared memory is used.
//
// c, a, b: device pointers to at least blockDim.x ints each.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    HANDLE_ERROR( addWithCuda(c, a, b, arraySize) );

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    HANDLE_ERROR( cudaDeviceReset() );

    // NOTE: "pause" is a Windows shell command; on other platforms these
    // calls only produce a shell error message.
    system("pause");
    printCudaInformation();
    system("pause");
    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// c:    host output array, receives a[i] + b[i] for i in [0, size).
// a, b: host input arrays of `size` ints each.
// size: number of elements; it is used directly as the thread count of a
//       single block, so it must not exceed the device's maximum threads per
//       block (a larger value makes the launch fail and the process exit).
//
// Returns cudaSuccess. Every CUDA failure is handled by HANDLE_ERROR, which
// terminates the process, so no other status is ever returned.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    const size_t bytes = size * sizeof(int);

    // Choose which GPU to run on, change this on a multi-GPU system.
    HANDLE_ERROR(cudaSetDevice(0));

    // Nothing to compute; a launch with zero threads per block would be
    // rejected as an invalid configuration.
    if (size == 0) {
        return cudaSuccess;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    HANDLE_ERROR(cudaMalloc((void**)&dev_c, bytes));
    HANDLE_ERROR(cudaMalloc((void**)&dev_a, bytes));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b, bytes));

    // Copy input vectors from host memory to GPU buffers.
    HANDLE_ERROR(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    // The second operand must come from `b`; copying `a` here would make the
    // kernel compute a + a.
    HANDLE_ERROR(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel.
    HANDLE_ERROR(cudaGetLastError());

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    HANDLE_ERROR(cudaDeviceSynchronize());

    // Copy output vector from GPU buffer to host memory.
    HANDLE_ERROR(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));

    // Release the device buffers so repeated calls do not leak device memory.
    HANDLE_ERROR(cudaFree(dev_c));
    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));

    return cudaSuccess;
}

// Prints the number of CUDA devices and selected properties of device 0
// (general, memory, and per-block/multiprocessor limits) to stdout.
// Any failing CUDA query terminates the process via HANDLE_ERROR.
void printCudaInformation()
{
    int count;
    HANDLE_ERROR(cudaGetDeviceCount(&count));
    printf("count=%d\n", count);

    cudaDeviceProp myProp;
    HANDLE_ERROR(cudaGetDeviceProperties(&myProp, 0));

    printf(" --- General Information of My Cuda Device ---\n");
    printf(" Device name: %s\n", myProp.name);
    printf(" Computer capatibility : %d.%d\n", myProp.major, myProp.minor);
    printf(" Clock rate: %d\n", myProp.clockRate);

    // The memory sizes below are size_t fields, hence %zu.
    printf(" --- Memory Information of My Cuda Device ---\n");
    printf(" Total global memory: %zu =%zu double\n",
           myProp.totalGlobalMem, myProp.totalGlobalMem / sizeof(double));
    printf(" Total const memory: %zu =%zu int\n",
           myProp.totalConstMem, myProp.totalConstMem / sizeof(int));
    printf(" max memoory pitch: %zu\n", myProp.memPitch);

    printf(" --- Multiprocessor Information of My Cuda Device ---\n");
    printf(" multprocessor count= %d\n", myProp.multiProcessorCount);
    // sharedMemPerBlock / regsPerBlock are per-block limits.
    printf(" Shared mem per block=%zu\n", myProp.sharedMemPerBlock);
    printf(" Registers per block=%d\n", myProp.regsPerBlock);
    printf(" Thread in wrap=%d\n", myProp.warpSize);
    printf(" Max thread per block=%d\n", myProp.maxThreadsPerBlock);
    printf(" Max threads dimensions= (%d, %d, %d)\n",
           myProp.maxThreadsDim[0], myProp.maxThreadsDim[1], myProp.maxThreadsDim[2]);
    printf(" Max Grid dimensions= (%d, %d, %d)\n",
           myProp.maxGridSize[0], myProp.maxGridSize[1], myProp.maxGridSize[2]);
    printf("\n");
}