• Parallel Array Summation: A CUDA Implementation


    Introduction

    Reference: https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf

    NVIDIA has an official slide deck on reduce sum, i.e., summing the elements of an array. A task that is trivial in a serial program turns out to involve quite a few tricks when implemented in parallel. The end of the deck gives the final optimized version, which is well worth studying, yet there is hardly any runnable demo of it online. A simple demo program is given here.

    Complete code

    #include <cuda_runtime_api.h>
    #include <cuda.h>
    #include <device_functions.h>
    #include <stdio.h>
    #include <math.h>
    #include <string.h>
    #include <time.h>
    
    #define CUDA_ERROR_CHECK
    #define WARP_SIZE 32
    #define MAX_THREAD 512
    #define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )
    
    inline void __cudaCheckError( const char *file, const int line )
    {
    #ifdef CUDA_ERROR_CHECK
        cudaError err = cudaGetLastError();
        if ( cudaSuccess != err )
        {
        fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n",
                 file, line, cudaGetErrorString( err ) );
            exit( -1 );
        }
    
        // More careful checking. However, this will affect performance.
        // Comment away if needed.
        err = cudaDeviceSynchronize();
        if( cudaSuccess != err )
        {
        fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
                 file, line, cudaGetErrorString( err ) );
            exit( -1 );
        }
    #endif
    
        return;
    }
    
    __device__ void warpReduce(volatile int *sdata, unsigned int tid, int blockSize)
    {
        if (blockSize >= 64) {sdata[tid] += sdata[tid + 32];}
        if (blockSize >= 32) {sdata[tid] += sdata[tid + 16];}
        if (blockSize >= 16) {sdata[tid] += sdata[tid + 8];}
        if (blockSize >= 8) {sdata[tid] += sdata[tid + 4];}
        if (blockSize >= 4) {sdata[tid] += sdata[tid + 2];}
        if (blockSize >= 2) {sdata[tid] += sdata[tid + 1];}
    }
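
    // Editor's note (an assumption, not part of the original demo): the volatile
    // warp-unrolled reduction above relies on implicit warp-synchronous execution,
    // which is no longer guaranteed from Volta (compute capability 7.0) onward.
    // A shuffle-based reduction of the last warp is a common alternative, e.g.:
    //
    // __device__ int warpReduceShfl(int val)
    // {
    //     for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
    //         val += __shfl_down_sync(0xffffffff, val, offset);
    //     return val;
    // }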
    
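    // reduce6: each block grid-strides over g_idata, accumulating up to two input
    // elements per thread per pass into shared memory, then tree-reduces the
    // block's values; thread 0 writes the block's partial sum to g_odata.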
    __global__ void reduce6(int *g_idata, int *g_odata, unsigned int n, int blockSize) 
    {
        extern __shared__ int sdata[];
        
        unsigned int tid = threadIdx.x;
        unsigned int i = blockIdx.x*(blockSize*2) + tid; 
    unsigned int gridSize = blockSize*2*gridDim.x; // grid-stride step: each block consumes 2*blockSize elements per pass
    
    sdata[tid] = 0; // shared memory must be zero-initialized before accumulating
        while (i < n) 
        { 
            sdata[tid] += g_idata[i];
            if ((i+blockSize) < n)
                sdata[tid] += g_idata[i+blockSize];
            
            i += gridSize; 
        }
         __syncthreads();
        if (blockSize >= 512) 
        { 
            if (tid < 256)
            { 
                sdata[tid] += sdata[tid + 256]; 
            }
            __syncthreads(); 
        } 
        if (blockSize >= 256) 
        { 
            if (tid < 128) 
            { 
                sdata[tid] += sdata[tid + 128]; 
            }
            __syncthreads(); 
        } 
        if (blockSize >= 128)
        { 
            if (tid < 64)
            { 
                sdata[tid] += sdata[tid + 64]; 
            } 
            __syncthreads(); 
        }
        if (tid < 32) warpReduce(sdata, tid, blockSize);
        if (tid == 0) g_odata[blockIdx.x] = sdata[0]; 
    }
    
    // Rounds x up to the next multiple of WARP_SIZE (and to at least WARP_SIZE).
    unsigned int calc_next_32_times(unsigned int x)
    {
        unsigned int next_pow_32_times = ((x + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
        if (next_pow_32_times < WARP_SIZE)
            next_pow_32_times = WARP_SIZE;
        
        return next_pow_32_times;
    }
    
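    // Pick a 1-D launch configuration: one thread per padded input element,
    // capped at MAX_THREAD threads per block.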
    int configure_reduce_sum(dim3 &grid_dim, dim3 &block_dim, unsigned int next_pow_32_times)
    {
        if(next_pow_32_times < MAX_THREAD)
        {
            block_dim = dim3(next_pow_32_times, 1, 1);
            grid_dim = dim3(1,1,1);
        }
        else
        {
            int grid_x = ceil((double) next_pow_32_times / MAX_THREAD);
            block_dim = dim3(MAX_THREAD, 1, 1);
            grid_dim = dim3(grid_x,1,1);
        }
    
        return 0;
    }
    
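    // Host-side wrapper: copies the input to a device buffer padded to a multiple
    // of 32, launches one level of reduce6, and sums the per-block partial results
    // on the host.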
    extern "C" 
    long API_cuda_reduce_sum(int *arr, unsigned int total_num)
    {
        int *d_arr, *d_out_arr, *h_out_arr;
        unsigned int next_pow_32_times = calc_next_32_times(total_num);
    
        cudaMalloc(&d_arr, next_pow_32_times * sizeof(int));
        cudaMemset(d_arr, 0, next_pow_32_times * sizeof(int));
        
        dim3 block_dim, grid_dim;
        configure_reduce_sum(grid_dim, block_dim, next_pow_32_times);
        
        int block_num = grid_dim.x * grid_dim.y * grid_dim.z;
       
        cudaMalloc(&d_out_arr, block_num * sizeof(int));
        cudaMemset(d_out_arr, 0, block_num * sizeof(int));
        CudaCheckError();
    
        cudaMemcpy(d_arr, arr, sizeof(int) * total_num, cudaMemcpyHostToDevice);
        
        reduce6<<<grid_dim, block_dim, block_dim.x*sizeof(int)>>>(d_arr, d_out_arr, total_num, block_dim.x);
        CudaCheckError();
        cudaDeviceSynchronize();
    
        h_out_arr = new int[block_num];
        memset(h_out_arr, 0, sizeof(int)*block_num);
        cudaMemcpy(h_out_arr, d_out_arr, sizeof(int) * block_num, cudaMemcpyDeviceToHost);
    
        long ret=0;
        for(int i=0; i<block_num; i++)
        {
            ret += h_out_arr[i];
        }
    
        cudaFree(d_arr);
        cudaFree(d_out_arr);
        delete []h_out_arr;
    
        return ret;
    
    }
    
    int main(int argc, char **argv)
    {
        int arr[] = {1,2,3,4,4,5,5,6,7,3,2,3,1,1,1,1,1,4,1,2,3,4,4,5,5,6,7,3,2,3,1,1,1,1,1,4,1,2,3,4,4,5,5,6,7,3,2,3,1,1,1,1,1,4,1,2,3,4,4,5,5,6,7,3,2,3,1,1,1,1,1,4,1,1,1,4,1,1,1,4}; //80
        int arr_len = sizeof(arr)/sizeof(arr[0]);
        int times = 5000000;
        int *big_arr = new int[times*arr_len];
        long long sum = 0;
        clock_t start, finish;
    
        for(int i=0; i<times; i++)
        {
            memcpy((void*)(big_arr + (i*arr_len)), (void*)arr, sizeof(int)*arr_len);
        }
    
        start = clock();
        for(int i=0; i<arr_len*times; i++)
        {  
            sum += big_arr[i];
        }
        finish = clock();
    printf("arr len %d\n", arr_len*times);
    printf("serial sum of arr: %lld, cost time: %.4f s\n", sum, (double) (finish - start) / CLOCKS_PER_SEC);
    
        start = clock();
    long cuda_sum = API_cuda_reduce_sum(big_arr, arr_len*times);
        finish = clock();
    
    printf("cuda sum of arr: %ld, cost time: %.4f s\n", cuda_sum, (double) (finish - start) / CLOCKS_PER_SEC);
        return 0;
    
    }

    A complete sum would require calling the kernel recursively. Here only the first level of the reduction is implemented: each block produces one partial sum, and the per-block sums are then added serially on the host to obtain the final result.
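
    As a rough sketch of that recursive step (my own addition, not from the original post), the per-block partial sums can be fed back through the same kernel until a single value remains. The helper below reuses reduce6, calc_next_32_times and configure_reduce_sum from the listing above; the name reduce_recursive is made up for illustration, and the function takes ownership of (and frees) the device buffer it is given.

    long reduce_recursive(int *d_in, unsigned int n)
    {
        // Reduce level by level on the device until a single partial sum is left.
        while (n > 1)
        {
            dim3 grid_dim, block_dim;
            configure_reduce_sum(grid_dim, block_dim, calc_next_32_times(n));
            int block_num = grid_dim.x * grid_dim.y * grid_dim.z;

            int *d_out;
            cudaMalloc(&d_out, block_num * sizeof(int));
            cudaMemset(d_out, 0, block_num * sizeof(int));

            reduce6<<<grid_dim, block_dim, block_dim.x * sizeof(int)>>>(d_in, d_out, n, block_dim.x);
            cudaDeviceSynchronize();

            cudaFree(d_in);   // the previous level's buffer is no longer needed
            d_in = d_out;     // this level's output becomes the next level's input
            n = block_num;
        }

        int result = 0;
        cudaMemcpy(&result, d_in, sizeof(int), cudaMemcpyDeviceToHost);
        cudaFree(d_in);
        return result;
    }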

    Compile and run

    nvcc cuda_reduce_sum.cu -o cuda_reduce
    
    ./cuda_reduce
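
    Depending on the GPU and toolkit version, a target architecture flag can also be passed (a general nvcc option, not something the original post specifies), for example:

    nvcc -arch=sm_70 cuda_reduce_sum.cu -o cuda_reduce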
    

      

    Testing

    Program output:

    arr len 400000000
    
    serial sum of arr: 1150000000, cost time: 0.8800
    
    cuda sum of arr: 1150000000, cost time: 0.5500

    Testing showed that when the array is small, the serial sum is faster than the CUDA version, because the CUDA path has to pay for host-to-device memory transfers and the kernel launch, which are relatively expensive. So for small amounts of data, the serial sum is more efficient.
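
    One practical way to act on that observation (my own sketch, not from the original post) is to dispatch on a size threshold; the cutoff below is an arbitrary placeholder that would need to be tuned by benchmarking on the target machine.

    long sum_array(int *arr, unsigned int n)
    {
        // Made-up threshold: below it, transfer and kernel-launch overhead
        // usually outweighs the parallel speedup.
        const unsigned int SMALL_N_THRESHOLD = 1 << 20;

        if (n < SMALL_N_THRESHOLD)
        {
            long s = 0;
            for (unsigned int i = 0; i < n; i++)
                s += arr[i];
            return s;
        }

        return API_cuda_reduce_sum(arr, n);
    }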

     
  • Original post: https://www.cnblogs.com/walter-xh/p/11985065.html