• CUDA Programming Study 6: Dot Product (dot)


    __shared__ float cache[threadPerBlock]; // the __shared__ qualifier declares a buffer in per-block shared memory

    __syncthreads(); // synchronizes the threads within a block: code after this call runs only once every thread in the block has finished the work before it

    Code (each block computes a partial dot product with a shared-memory reduction, and the host sums the per-block results):

    /*
    ============================================================================
    Name : dot.cu
    Author : can
    Version :
    Copyright : Your copyright notice
    Description : CUDA dot product with a shared-memory reduction
    ============================================================================
    */

    #include <iostream>
    using namespace std;

    static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
    #define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

    #define imin(a,b) (a<b?a:b)
    const int N = 33 * 1024;
    const int threadPerBlock = 256;
    const int blockPerGrid = imin(32, (N + threadPerBlock - 1) / threadPerBlock); // cap at 32 blocks; the grid-stride loop in the kernel covers the remaining elements

    __global__ void dot(float *a, float *b, float *c)
    {
        __shared__ float cache[threadPerBlock]; // per-block shared-memory buffer for partial sums
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        int cacheIndex = threadIdx.x;

        // Each thread accumulates its products over a grid-stride loop.
        float temp = 0;
        while (tid < N)
        {
            temp += a[tid] * b[tid];
            tid += blockDim.x * gridDim.x;
        }
        cache[cacheIndex] = temp;

        // Synchronize the threads in the block: continue only after every
        // thread has written its partial sum into cache.
        __syncthreads();

        // Tree reduction in shared memory (requires threadPerBlock to be a power of 2).
        int i = blockDim.x / 2;
        while (i != 0)
        {
            if (cacheIndex < i)
            {
                cache[cacheIndex] += cache[cacheIndex + i];
            }
            __syncthreads();
            i /= 2;
        }

        // Thread 0 writes this block's partial result to the output array.
        if (cacheIndex == 0)
        {
            c[blockIdx.x] = cache[0];
        }
    }

    int main()
    {
        float *a, *b, c, *partial_c;
        float *dev_a, *dev_b, *dev_partial_c;

        // Allocate host buffers; partial_c holds one partial sum per block.
        a = (float *)malloc(N*sizeof(float));
        b = (float *)malloc(N*sizeof(float));
        partial_c = (float *)malloc(blockPerGrid*sizeof(float));

        // Allocate device buffers; dev_partial_c only needs one float per block.
        CUDA_CHECK_RETURN(cudaMalloc((void **)&dev_a, N*sizeof(float)));
        CUDA_CHECK_RETURN(cudaMalloc((void **)&dev_b, N*sizeof(float)));
        CUDA_CHECK_RETURN(cudaMalloc((void **)&dev_partial_c, blockPerGrid*sizeof(float)));

        // Fill the input vectors on the host.
        for (int i = 0; i < N; i++)
        {
            a[i] = i;
            b[i] = i*2;
        }

        CUDA_CHECK_RETURN(cudaMemcpy(dev_a, a, N*sizeof(float), cudaMemcpyHostToDevice));
        CUDA_CHECK_RETURN(cudaMemcpy(dev_b, b, N*sizeof(float), cudaMemcpyHostToDevice));

        dot<<<blockPerGrid, threadPerBlock>>>(dev_a, dev_b, dev_partial_c);

        CUDA_CHECK_RETURN(cudaMemcpy(partial_c, dev_partial_c, blockPerGrid*sizeof(float), cudaMemcpyDeviceToHost));

        // Finish the reduction on the host: sum the per-block partial results.
        c = 0;
        for (int i = 0; i < blockPerGrid; i++)
        {
            c += partial_c[i];
        }

        // Verify against the closed form: sum of i*(2i) for i = 0..N-1 is 2*sum_squares(N-1).
        #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
        cout<<"Does GPU value "<<c<<" = "<<2*sum_squares((float)(N-1))<<endl;

        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_partial_c);
        free(a);
        free(b);
        free(partial_c);
        return 0;
    }

    static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
    {
        if (err == cudaSuccess)
            return;
        std::cerr << statement << " returned " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
        exit(1);
    }
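
    The listing builds directly with nvcc (for example, nvcc dot.cu -o dot, then ./dot; add an -arch flag for your GPU if needed). As a minimal, hypothetical sanity check (the cpu_dot helper below is not part of the original program), the GPU sum can be compared against a plain host loop that accumulates in double, which reduces the rounding error that single-precision accumulation over 33*1024 terms introduces:

    // Hypothetical host-side reference: accumulate in double and compare with c.
    double cpu_dot(const float *a, const float *b, int n)
    {
        double sum = 0.0;
        for (int i = 0; i < n; i++)
        {
            sum += (double)a[i] * (double)b[i];
        }
        return sum;
    }

    Calling cpu_dot(a, b, N) before the frees and printing it next to c makes the precision of the GPU result easy to judge; wrapping a cudaDeviceSynchronize() in CUDA_CHECK_RETURN right after the kernel launch is another small addition that surfaces launch errors.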

  • Original post: https://www.cnblogs.com/shrimp-can/p/5046664.html