• CUDA学习:第一个CUDA代码:数组求和


    今天有些收获了,成功运行了数组求和代码:就是将N个数相加求和

    //环境:CUDA5.0,vs2010

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"

    #include <stdio.h>

    cudaError_t addWithCuda(int *c, int *a);


    #define TOTALN 72120
    #define BLOCKS_PerGrid 32
    #define THREADS_PerBlock 64 //2^6 (must be a power of two for the halving reduction in SumArray)

    // Kernel: computes one partial sum per block over the first TOTALN elements of `a`.
    //
    //   c - device array with at least gridDim.x elements; c[b] receives the sum
    //       accumulated by block b.
    //   a - device array with at least TOTALN elements (read only).
    //
    // Launch preconditions: 1-D grid and 1-D block, with blockDim.x equal to
    // THREADS_PerBlock (the shared array is sized by that macro) and
    // THREADS_PerBlock a power of two (the tree reduction halves it each step).
    // Static shared memory used: THREADS_PerBlock * sizeof(unsigned int) bytes.
    __global__ void SumArray(int *c, int *a)
    {
        // One slot per thread of the block for its private partial sum.
        __shared__ unsigned int partial[THREADS_PerBlock];

        const int lane = threadIdx.x;                // this thread's slot in `partial`
        const int stride = gridDim.x*blockDim.x;     // total number of threads in the grid
        unsigned acc = 0;

        // Grid-stride loop: each thread accumulates elements idx, idx+stride, ...
        for (int idx = threadIdx.x+blockIdx.x*blockDim.x; idx < TOTALN; idx = idx+stride)
        {
            acc += a[idx];
        }

        partial[lane] = acc;

        // Every thread's partial sum must be stored before any thread reads a
        // neighbour's slot.
        __syncthreads();

        // Tree reduction in shared memory: at each step the lower half of the
        // active slots absorbs the upper half, then the active range is halved.
        for (unsigned half = THREADS_PerBlock>>1; half != 0; half = half>>1)
        {
            if (lane < half)
            {
                partial[lane] += partial[lane+half];
            }
            // Barrier sits outside the `if` so that all threads of the block reach it.
            __syncthreads();
        }

        // After the last step slot 0 holds the block total; thread 0 publishes it.
        if (lane == 0)
        {
            c[blockIdx.x] = partial[0];
        }
    }

    // Program entry point: fills an array of TOTALN ints with 1, sums it on the
    // GPU through addWithCuda (which yields one partial sum per block), adds the
    // per-block partial sums on the host, recomputes the sum on the CPU as a
    // reference, and prints both values.
    //
    // Returns 0 on success, 1 if addWithCuda or cudaDeviceReset reports an error.
    int main()
    {
        // Static storage instead of automatic: TOTALN ints would otherwise occupy
        // hundreds of kilobytes of stack, which overflows as soon as TOTALN grows.
        static int a[TOTALN];
        static int c[BLOCKS_PerGrid];

        unsigned int j;
        for (j = 0; j < TOTALN; j++)
        {
            // Initialise the input; every element is 1 so the expected sum is TOTALN.
            a[j] = 1;
        }

        // Run the parallel sum; c receives one partial sum per block.
        cudaError_t cudaStatus = addWithCuda(c, a);

        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addWithCuda failed!");
            return 1;
        }

        // Finish the reduction on the host by adding the per-block partial sums.
        unsigned int sum1 = 0;
        for (j = 0; j < BLOCKS_PerGrid; j++)
        {
            sum1 += c[j];
        }

        // CPU reference sum used to check the GPU result.
        unsigned int sum2 = 0;
        for (j = 0; j < TOTALN; j++)
        {
            sum2 += a[j];
        }

        // Both values are unsigned int, so the conversion specifier must be %u.
        printf("sum1=%u; sum2=%u ", sum1, sum2);

        // cudaDeviceReset must be called before exiting in order for profiling and
        // tracing tools such as Nsight and Visual Profiler to show complete traces.
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceReset failed!");
            return 1;
        }

        return 0;
    }

    // Host helper that sums the TOTALN ints of `a` on the GPU with the SumArray kernel.
    //
    //   c - host array with at least BLOCKS_PerGrid elements; on success c[b] holds
    //       the partial sum produced by block b (the caller adds them up).
    //   a - host array with at least TOTALN elements to be summed.
    //
    // Returns cudaSuccess, or the error code of the first CUDA call that failed
    // (a message is also written to stderr). Device buffers are released on
    // every path.
    cudaError_t addWithCuda(int *c, int *a)
    {
        // Declared (and zero-initialised) before the first `goto Error` so the
        // cleanup code can free them unconditionally.
        int *dev_a = 0;
        int *dev_c = 0;
        cudaError_t cudaStatus;

        // Choose which GPU to run on, change this on a multi-GPU system.
        cudaStatus = cudaSetDevice(0);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
            goto Error;
        }

        // Device buffer for the per-block partial sums (same length as c in main).
        cudaStatus = cudaMalloc((void**)&dev_c, BLOCKS_PerGrid * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            goto Error;
        }

        // Device buffer for the input (same length as a in main).
        cudaStatus = cudaMalloc((void**)&dev_a, TOTALN * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            goto Error;
        }

        // Copy the input array from host memory to the GPU buffer.
        cudaStatus = cudaMemcpy(dev_a, a, TOTALN * sizeof(int), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }

        // Launch the reduction: BLOCKS_PerGrid blocks of THREADS_PerBlock threads.
        SumArray<<<BLOCKS_PerGrid, THREADS_PerBlock>>>(dev_c, dev_a);

        // A kernel launch returns no status itself; launch failures (for example an
        // invalid configuration) are only reported through cudaGetLastError. Without
        // this check a failed launch would go unnoticed and uninitialised device
        // memory would be copied back as the "result".
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "SumArray launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }

        // cudaDeviceSynchronize waits for the kernel to finish, and returns
        // any errors encountered during its execution.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching SumArray! ", cudaStatus);
            goto Error;
        }

        // Copy the per-block partial sums from the GPU buffer to host memory.
        cudaStatus = cudaMemcpy(c, dev_c, BLOCKS_PerGrid * sizeof(int), cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }

    Error:
        // Reached on success as well as on failure; cudaFree on a null pointer is a no-op.
        cudaFree(dev_c);
        cudaFree(dev_a);

        return cudaStatus;
    }

    www.shuleikeji.com
  • 相关阅读:
    《大话数据结构》第9章 排序 9.9 快速排序(下)
    [HTML5 DOM] dispatchEvent
    [AWS SAP] Exam Tips 2 Continues Improvement for Existing Solutions
    遇见C++ PPL:C++ 的并行和异步
    遇见C++ AMP:在GPU上做并行计算
    遇见C++ Lambda
    遇见C++ AMP:GPU的线程模型和内存模型
    服务器推技术
    转http状态码
    extjs同步与异步请求互换
  • 原文地址:https://www.cnblogs.com/dongchunxiao/p/4854751.html
Copyright © 2020-2023  润新知