Cuda learn record two

这是一个cuda 自带的算例，包含cuda 计算的一般流程。

这个地址有比较清楚的cuda的介绍。感谢作者分享（http://blog.csdn.net/hjimce/article/details/51506207）

一般来说，cuda 计算的流程是:

1. 设置显卡编号：cudaSetDevice；这个主要是在有多个GPU的机器上使用，其编号是从0号开始。

2. 为显卡开辟内存变量： cudaMalloc；使用方法：cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));

这里的指针指向设备端（显存）的内存地址，无法在主机端直接使用（解引用）。

3.把主机端的数据拷贝到设备端：cudaMemcpy; 使用方法：

cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);

这里注意需要指明数据传输的方向（最后一个参数，例如 cudaMemcpyHostToDevice 表示从主机端拷贝到设备端）。

4. 调用内核函数（即用 __global__ 修饰的函数）：

addKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_c, dev_a, dev_b);

这里 blocksPerGrid 和 threadsPerBlock 都是 dim3 类型的数据（一维情况下也可以直接传入整数）。

5. 把计算结果拷贝到主机端。

6. 释放显存空间。

  1 #include "cuda_runtime.h"
  2 #include "device_launch_parameters.h"
  3 
  4 #include <stdio.h>
  5 
  /**
   * Terminate the program with a diagnostic if a CUDA runtime call failed.
   *
   * @param err   Status code returned by a CUDA runtime API call.
   * @param file  Source file of the call site (supplied by HANDLE_ERROR).
   * @param line  Source line of the call site (supplied by HANDLE_ERROR).
   *
   * Returns normally when err == cudaSuccess. Otherwise prints the CUDA error
   * string together with the call site and exits with EXIT_FAILURE.
   */
  static void HandleError(cudaError_t err,
      const char *file,
      int line) {
      if (err != cudaSuccess) {
          // Report on stderr so the diagnostic is kept apart from normal output.
          fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err),
              file, line);
          exit(EXIT_FAILURE);
      }
  }
  // Wraps a CUDA runtime call so that a failure is reported with its call site.
  #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
 16 
 17 cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
 18 void printCudaInformation();
 19 
 /**
  * Element-wise integer addition: each thread computes one output element.
  *
  * The element index is threadIdx.x only (blockIdx is not used), so the
  * kernel is meant to be launched as a single one-dimensional block.
  * There is no bounds check: the caller must not launch more threads than
  * there are elements in a, b and c.
  *
  * @param c  Output array; c[t] receives a[t] + b[t] for thread t.
  * @param a  First input array.
  * @param b  Second input array.
  */
 __global__ void addKernel(int *c, const int *a, const int *b)
 {
     const int tid = threadIdx.x;
     const int sum = a[tid] + b[tid];
     c[tid] = sum;
 }
 25 
 /**
  * Entry point: adds two fixed 5-element vectors with addWithCuda, prints the
  * result, then prints the device report from printCudaInformation.
  *
  * @return 0 on normal completion. A failing CUDA call wrapped in
  *         HANDLE_ERROR terminates the process instead of returning.
  */
 int main()
 {
     const int arraySize = 5;
     const int a[arraySize] = { 1, 2, 3, 4, 5 };
     const int b[arraySize] = { 10, 20, 30, 40, 50 };
     int c[arraySize] = { 0 };

     // Add vectors in parallel.
     HANDLE_ERROR( addWithCuda(c, a, b, arraySize) );

     printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
         c[0], c[1], c[2], c[3], c[4]);

     // "pause" is a Windows shell command; it keeps the console window open.
     system("pause");
     printCudaInformation();

     // cudaDeviceReset must be called before exiting in order for profiling and
     // tracing tools such as Nsight and Visual Profiler to show complete traces.
     // It is issued after the device report so that no CUDA call follows it.
     HANDLE_ERROR( cudaDeviceReset() );

     system("pause");
     return 0;
 }
 48 
 /**
  * Helper that adds two host vectors element-wise on GPU 0: c[i] = a[i] + b[i].
  *
  * Steps: select device 0, allocate three device buffers, copy both inputs
  * to the device, launch addKernel, copy the result back, free the buffers.
  *
  * @param c     Host output buffer with room for `size` ints.
  * @param a     Host input vector of `size` ints.
  * @param b     Host input vector of `size` ints.
  * @param size  Number of elements. The kernel is launched as one block of
  *              `size` threads, so a value above the device's per-block
  *              thread limit is reported as a launch error.
  * @return cudaSuccess. Every CUDA call is wrapped in HANDLE_ERROR, which
  *         terminates the process on failure instead of returning the error.
  */
 cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
 {
     int *dev_a = 0;
     int *dev_b = 0;
     int *dev_c = 0;
     const size_t bytes = size * sizeof(int);

     // Choose which GPU to run on, change this on a multi-GPU system.
     HANDLE_ERROR(cudaSetDevice(0));

     // A launch with zero threads is an invalid configuration; an empty
     // vector simply has nothing to add.
     if (size == 0) {
         return cudaSuccess;
     }

     // Allocate GPU buffers for three vectors (two input, one output)
     HANDLE_ERROR(cudaMalloc((void**)&dev_c, bytes));
     HANDLE_ERROR(cudaMalloc((void**)&dev_a, bytes));
     HANDLE_ERROR(cudaMalloc((void**)&dev_b, bytes));

     // Copy input vectors from host memory to GPU buffers.
     HANDLE_ERROR(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
     // dev_b must be filled from b; copying a here would compute a + a.
     HANDLE_ERROR(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

     // Launch a kernel on the GPU with one thread for each element.
     addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

     // Check for any errors launching the kernel
     HANDLE_ERROR(cudaGetLastError());

     // cudaDeviceSynchronize waits for the kernel to finish, and returns
     // any errors encountered during the launch.
     HANDLE_ERROR(cudaDeviceSynchronize());

     // Copy output vector from GPU buffer to host memory.
     HANDLE_ERROR(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));

     // Release the device buffers so repeated calls do not leak GPU memory.
     HANDLE_ERROR(cudaFree(dev_c));
     HANDLE_ERROR(cudaFree(dev_a));
     HANDLE_ERROR(cudaFree(dev_b));

     return cudaSuccess;
 }
 85 
 /**
  * Print the number of CUDA devices and a property report for device 0.
  *
  * The report covers name, compute capability, clock rate, memory sizes and
  * the block/grid limits taken from cudaDeviceProp. If the device count or
  * the properties cannot be queried, or no device is present, a message is
  * written to stderr (for query failures) and the function returns without
  * printing the report.
  *
  * size_t fields are cast to unsigned long long and printed with %llu, because
  * %d / %ld do not match size_t on 64-bit targets.
  */
 void printCudaInformation()
 {
     int count = 0;
     cudaError_t status = cudaGetDeviceCount(&count);
     if (status != cudaSuccess) {
         fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
         return;
     }
     printf("count=%d \n", count);
     if (count < 1) {
         // No device 0 to describe.
         return;
     }

     cudaDeviceProp myProp;
     status = cudaGetDeviceProperties(&myProp, 0);
     if (status != cudaSuccess) {
         fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(status));
         return;
     }

     printf(" --- General Information of My Cuda Device ---\n");
     printf("     Device name: %s\n", myProp.name);
     printf("     Computer capatibility : %d.%d\n", myProp.major, myProp.minor);
     printf("     Clock rate: %d\n", myProp.clockRate);

     printf(" --- Memory Information of My Cuda Device ---\n");
     printf("    Total global memory: %llu =%llu double \n",
         (unsigned long long)myProp.totalGlobalMem,
         (unsigned long long)(myProp.totalGlobalMem / sizeof(double)));
     printf("    Total const memory: %llu =%llu int \n",
         (unsigned long long)myProp.totalConstMem,
         (unsigned long long)(myProp.totalConstMem / sizeof(int)));
     printf("    max memoory pitch: %llu \n", (unsigned long long)myProp.memPitch);

     printf(" --- Multiprocessor Information of My Cuda Device ---\n");
     printf("    multprocessor count= %d\n", myProp.multiProcessorCount);
     // sharedMemPerBlock and regsPerBlock are per-block limits, so the labels
     // say "per block" rather than "per mp".
     printf("    Shared mem per block=%llu\n", (unsigned long long)myProp.sharedMemPerBlock);
     printf("    Registers per block=%d\n", myProp.regsPerBlock);
     printf("    Thread in warp=%d\n", myProp.warpSize);
     printf("    Max thread per block=%d\n", myProp.maxThreadsPerBlock);
     printf("    Max threads dimensions= (%d, %d, %d) \n",
         myProp.maxThreadsDim[0], myProp.maxThreadsDim[1], myProp.maxThreadsDim[2]);
     printf("    Max Grid dimensions= (%d, %d, %d) \n",
         myProp.maxGridSize[0], myProp.maxGridSize[1], myProp.maxGridSize[2]);
     printf("\n");
 }

相关阅读:
云服务器
 发布一个Codesmith 模版，生成Linq 增删除改。看看有什么问题，请大家指点
 标记：今天终于搭建好了SharpDevelop的编译环境
 高效地根据属性名获取某对象的属性值
 如何使用C#操作快捷方式(获取快捷方式属性、创建快捷方式)
成功编译Chrome浏览器(编译Chromium)
问题：某厂部班组有6个人，每周每人都要上5天班，而且每人都要连续休息两天？
istio Egress Gateway 统一流量出口
 前端笔记
 node：将json转换ts
原文地址：https://www.cnblogs.com/cofludy/p/6925642.html