cuda编程实战

参考文档：
https://blog.csdn.net/shuzfan/article/category/7072956
https://www.cnblogs.com/1024incn/p/4537010.html
https://www.cnblogs.com/1024incn/tag/CUDA/

cuda第一个例子

流程

创建源文件以.cu命名
用nvcc编译程序
运行程序

创建文件hello.cu

#include <stdio.h>
// Kernel: every thread of the launch prints one greeting line using
// device-side printf. It takes no arguments and writes no memory.
// Device printf output is buffered; it reaches the host only when the host
// performs a synchronizing call (or resets the device) after the launch.
__global__ void helloFromGPU(void)
{
    // The "\n" escape is restored here: the literal had been split across two
    // physical source lines, which is not valid C/C++.
    printf("Hello World from GPU!\n");
}

// Host entry point of the hello-world example.
// Prints a greeting from the CPU, launches helloFromGPU with 1 block of
// 10 threads, and resets the device before exiting.
// Returns 0 on success, 1 if the kernel launch or its execution failed.
int main(void)
{
    // The "\n" escape is restored here: the literal had been split across two
    // physical source lines, which is not valid C/C++.
    printf("Hello World from CPU!\n");

    helloFromGPU<<<1, 10>>>();

    // A kernel launch returns nothing. Configuration errors are reported by
    // cudaGetLastError(), execution errors by the next synchronizing call.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Also flushes the device-side printf buffer to the host.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "helloFromGPU failed: %s\n", cudaGetErrorString(status));
        cudaDeviceReset();
        return 1;
    }

    cudaDeviceReset();
    return 0;
}

编译运行

# 注意：sm_20（Fermi 架构）自 CUDA 9.0 起已不再受支持；使用较新的工具链时，请省略 -arch 或改为与显卡匹配的架构（例如 sm_70）。
nvcc -arch sm_20 hello.cu -o hello
./hello

__global__ 是 CUDA C 在标准 C 基础上增加的修饰符，表示该函数由 NVCC 编译为设备代码，由主机端调用，并最终在设备（GPU）上运行。
调用形式为：
helloFromGPU<<<1,10>>>();
一个 kernel 由一组线程执行，所有线程执行相同的代码。上面一行三重尖括号 <<<1, 10>>> 中的 1 和 10 分别表示 block 的数量和每个 block 中的线程数，即该 kernel 共由 1×10 = 10 个线程执行。

一个典型的CUDA程序结构包含五个主要步骤：

分配GPU空间。
将数据从CPU端复制到GPU端。
调用CUDA kernel来执行计算。
计算完成后将数据从GPU拷贝回CPU。
清理GPU内存空间。

第二个例子

cuda_second.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Host-side interface: computes the vector sum c = a + b on the GPU.
//   c    - host output array receiving `size` ints
//   a, b - host input arrays of `size` ints each
//   size - number of elements in each vector
// Returns the cudaError_t status of the operation (cudaSuccess when every
// step succeeded).
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Kernel: element-wise vector addition, one element per thread.
//   c    - output vector (device pointer)
//   a, b - input vectors (device pointers)
// The element index is taken from threadIdx.x alone: blockIdx is not used and
// there is no bounds check. The launch must therefore use a single block with
// no more threads than the vectors have elements.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int tid = threadIdx.x;      // this thread's index within the block
    const int sum = a[tid] + b[tid];
    c[tid] = sum;
}

// Host entry point of the vector-addition example.
// Adds two fixed 5-element vectors on the GPU through addWithCuda and prints
// the result.
// Returns 0 on success, 1 if addWithCuda or cudaDeviceReset reports an error.
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add the two vectors in parallel on the device.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    // The "\n" escape is restored here: the literal had been split across two
    // physical source lines, which is not valid C/C++.
    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // Reset the device before exiting so that profiling and tracing tools
    // such as Nsight and Visual Profiler show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Host-side interface implementation: computes the vector sum c = a + b on
// the GPU.
//   c    - host output array receiving `size` ints
//   a, b - host input arrays of `size` ints each
//   size - number of elements in each vector
// Steps: select device 0, allocate three device buffers, copy a and b to the
// device, launch addKernel with one block of `size` threads, wait for it to
// finish, copy the result back into c, and free the device buffers.
// Returns cudaSuccess when every step succeeded, otherwise the status of the
// first failing step (a diagnostic is also written to stderr). An empty input
// (size == 0) returns cudaSuccess without touching c.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    // Declared before any goto so that no jump bypasses an initialization.
    const size_t bytes = size * sizeof(int);
    cudaError_t cudaStatus;

    // Choose which GPU to run on (relevant on multi-GPU machines).
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Nothing to add. A launch with 0 threads per block is an invalid
    // configuration, so empty input is handled here instead of being
    // reported as a launch failure.
    if (size == 0) {
        return cudaStatus;
    }

    // Allocate device buffers for the three vectors of c = a + b.
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input vectors a and b from host memory to the device buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the kernel with 1 block of `size` threads, one thread per element.
    // A `size` above the device's per-block thread limit is rejected at launch
    // and reported by the cudaGetLastError() check below.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check whether the launch itself failed.
    // ("\n" restored: the literal had been split across two source lines.)
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Block the host thread until the GPU has finished all previously issued
    // work (kernels, copies); errors raised during execution surface here.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the result from the device buffer back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Shared cleanup for the success and failure paths; pointers that were
    // never allocated are still 0 here.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

编译运行

# 注意：sm_20（Fermi 架构）自 CUDA 9.0 起已不再受支持；使用较新的工具链时，请省略 -arch 或改为与显卡匹配的架构（例如 sm_70）。
nvcc -arch sm_20 cuda_second.cu -o second
./second

相关阅读:
enca工具，检测文件编码
 ubuntu 支持gbk
LinkedList线程安全问题
 php防止form重复提交的方法
 Linux 内存泄露调试工具
 从B树、B+树、B*树谈到R 树
 Ubuntu 语言设置
 wwwauthenticate
Lua脚本语法说明
 jQuery学习总结之元素的相对定位和选择器持续更新中
原文地址：https://www.cnblogs.com/o-v-o/p/9975351.html