Introduction
Reference: https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
NVIDIA has an official slide deck on reduce sum, i.e. summing the elements of an array. The serial version is a trivial program, but the parallel implementation involves quite a few tricks. The last slides present the fully optimized version, which is well worth studying, yet there are almost no runnable demos of it online. This post provides a simple demo program.
Complete Code
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>

#define CUDA_ERROR_CHECK
#define WARP_SIZE 32
#define MAX_THREAD 512

#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )

inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    cudaError err = cudaGetLastError();
    if ( cudaSuccess != err )
    {
        fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n",
                 file, line, cudaGetErrorString( err ) );
        exit( -1 );
    }

    // More careful checking. However, this will affect performance.
    // Comment away if needed.
    err = cudaDeviceSynchronize();
    if ( cudaSuccess != err )
    {
        fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
                 file, line, cudaGetErrorString( err ) );
        exit( -1 );
    }
#endif
    return;
}

// Warp-synchronous reduction of the last 64 elements; volatile prevents the
// compiler from caching shared-memory values in registers.
__device__ void warpReduce(volatile int *sdata, unsigned int tid, int blockSize)
{
    if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
    if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
    if (blockSize >= 16) { sdata[tid] += sdata[tid + 8];  }
    if (blockSize >= 8)  { sdata[tid] += sdata[tid + 4];  }
    if (blockSize >= 4)  { sdata[tid] += sdata[tid + 2];  }
    if (blockSize >= 2)  { sdata[tid] += sdata[tid + 1];  }
}

// Final optimized kernel ("reduce6" in the NVIDIA slides): each block writes
// one partial sum of the input to g_odata[blockIdx.x].
__global__ void reduce6(int *g_idata, int *g_odata, unsigned int n, int blockSize)
{
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockSize * 2) + tid;
    unsigned int gridSize = blockSize * 2 * gridDim.x;   // stride covering the whole grid

    // Zero-initialize shared memory: some threads may not read any input element.
    sdata[tid] = 0;

    // Grid-stride loop: each thread adds two elements per iteration; the extra
    // bounds check keeps the read of g_idata[i + blockSize] inside the array.
    while (i < n)
    {
        sdata[tid] += g_idata[i];
        if ((i + blockSize) < n)
            sdata[tid] += g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();

    // Tree reduction in shared memory.
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid < 64)  { sdata[tid] += sdata[tid + 64];  } __syncthreads(); }

    // The last warp is reduced without further __syncthreads().
    if (tid < 32) warpReduce(sdata, tid, blockSize);

    // Thread 0 writes this block's partial sum.
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

// Round x up to the next multiple of 32 (the warp size).
unsigned int calc_next_32_times(unsigned int x)
{
    unsigned int next_pow_32_times = WARP_SIZE;
    for (int i = 2; next_pow_32_times < x; i++)
        next_pow_32_times = WARP_SIZE * i;
    return next_pow_32_times;
}

// Choose a 1-D launch configuration: one block if the padded size fits into a
// single block, otherwise MAX_THREAD threads per block.
int configure_reduce_sum(dim3 &grid_dim, dim3 &block_dim, unsigned int next_pow_32_times)
{
    if (next_pow_32_times < MAX_THREAD)
    {
        block_dim = dim3(next_pow_32_times, 1, 1);
        grid_dim = dim3(1, 1, 1);
    }
    else
    {
        int grid_x = ceil((double) next_pow_32_times / MAX_THREAD);
        block_dim = dim3(MAX_THREAD, 1, 1);
        grid_dim = dim3(grid_x, 1, 1);
    }
    return 0;
}

extern "C" long API_cuda_reduce_sum(int *arr, unsigned int total_num)
{
    int *d_arr, *d_out_arr, *h_out_arr;
    unsigned int next_pow_32_times = calc_next_32_times(total_num);

    cudaMalloc(&d_arr, next_pow_32_times * sizeof(int));
    cudaMemset(d_arr, 0, next_pow_32_times * sizeof(int));

    dim3 block_dim, grid_dim;
    configure_reduce_sum(grid_dim, block_dim, next_pow_32_times);

    int block_num = grid_dim.x * grid_dim.y * grid_dim.z;
    cudaMalloc(&d_out_arr, block_num * sizeof(int));
    cudaMemset(d_out_arr, 0, block_num * sizeof(int));
    CudaCheckError();

    cudaMemcpy(d_arr, arr, sizeof(int) * total_num, cudaMemcpyHostToDevice);
    reduce6<<<grid_dim, block_dim, block_dim.x * sizeof(int)>>>(d_arr, d_out_arr, total_num, block_dim.x);
    CudaCheckError();
    cudaDeviceSynchronize();

    h_out_arr = new int[block_num];
    memset(h_out_arr, 0, sizeof(int) * block_num);
    cudaMemcpy(h_out_arr, d_out_arr, sizeof(int) * block_num, cudaMemcpyDeviceToHost);

    // Add the per-block partial sums serially on the host.
    long ret = 0;
    for (int i = 0; i < block_num; i++)
    {
        ret += h_out_arr[i];
    }

    cudaFree(d_arr);
    cudaFree(d_out_arr);
    delete [] h_out_arr;
    return ret;
}

int main(int argc, char **argv)
{
    int arr[] = {1,2,3,4,4,5,5,6,7,3,2,3,1,1,1,1,1,4,
                 1,2,3,4,4,5,5,6,7,3,2,3,1,1,1,1,1,4,
                 1,2,3,4,4,5,5,6,7,3,2,3,1,1,1,1,1,4,
                 1,2,3,4,4,5,5,6,7,3,2,3,1,1,1,1,1,4,
                 1,1,1,4,1,1,1,4};   // 80 elements
    int arr_len = sizeof(arr) / sizeof(arr[0]);
    int times = 5000000;
    int *big_arr = new int[times * arr_len];
    long long sum = 0;
    clock_t start, finish;

    for (int i = 0; i < times; i++)
    {
        memcpy((void*)(big_arr + (i * arr_len)), (void*)arr, sizeof(int) * arr_len);
    }

    // Serial baseline.
    start = clock();
    for (int i = 0; i < arr_len * times; i++)
    {
        sum += big_arr[i];
    }
    finish = clock();
    printf("arr len %d\n", arr_len * times);
    printf("serial sum of arr: %lld, cost time: %.4f s\n", sum, (double)(finish - start) / CLOCKS_PER_SEC);

    // CUDA version (includes device allocation and host-to-device copy).
    start = clock();
    long cuda_sum = API_cuda_reduce_sum(big_arr, arr_len * times);
    finish = clock();
    printf("cuda sum of arr: %ld, cost time: %.4f s\n", cuda_sum, (double)(finish - start) / CLOCKS_PER_SEC);

    delete [] big_arr;
    return 0;
}
A complete reduction would require launching the kernel recursively. This demo only implements the first level: each block produces one partial sum, and the per-block partial sums are then added serially on the host to obtain the final result.
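As a sketch of how that second level could instead stay on the GPU, the per-block output can be fed back into reduce6 until a single value remains. The helper below is not part of the original demo; the name reduce_sum_multipass and its buffer handling are my assumptions, while reduce6, configure_reduce_sum and calc_next_32_times are the functions from the listing above.

// Sketch (not in the original demo): reduce a device array of length n down to
// a single value by launching reduce6 repeatedly. Each pass shrinks the input
// to one partial sum per block. The caller's d_in buffer is never freed.
long reduce_sum_multipass(int *d_in, unsigned int n)
{
    int *cur = d_in;          // input of the current pass
    unsigned int cur_n = n;
    int *prev_tmp = NULL;     // intermediate buffer produced by the previous pass

    while (cur_n > 1)
    {
        dim3 block_dim, grid_dim;
        configure_reduce_sum(grid_dim, block_dim, calc_next_32_times(cur_n));
        int block_num = grid_dim.x * grid_dim.y * grid_dim.z;

        int *d_out;
        cudaMalloc(&d_out, block_num * sizeof(int));
        cudaMemset(d_out, 0, block_num * sizeof(int));

        // One pass: cur_n elements in, block_num partial sums out.
        reduce6<<<grid_dim, block_dim, block_dim.x * sizeof(int)>>>(cur, d_out, cur_n, block_dim.x);
        cudaDeviceSynchronize();

        if (prev_tmp) cudaFree(prev_tmp);
        prev_tmp = d_out;
        cur = d_out;
        cur_n = block_num;
    }

    int result = 0;
    cudaMemcpy(&result, cur, sizeof(int), cudaMemcpyDeviceToHost);
    if (prev_tmp) cudaFree(prev_tmp);
    return result;
}

Note that, just like the original kernel, each pass still accumulates in int on the device; only the host-side total is held in a long.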
Compile and Run
nvcc cuda_reduce_sum.cu -o cuda_reduce
./cuda_reduce
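Depending on the GPU, you may also want to pass an architecture flag to nvcc; the compute capability below is only an example and should match your card:

nvcc -O2 -arch=sm_70 cuda_reduce_sum.cu -o cuda_reduce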
Test
Program output:
arr len 400000000
serial sum of arr: 1150000000, cost time: 0.8800 s
cuda sum of arr: 1150000000, cost time: 0.5500 s
During testing I found that for small arrays the serial sum is actually faster than the CUDA version, because the CUDA path has to pay for the host-to-device memory copy and the kernel launch, both of which are expensive. So when the amount of data is small, serial summation is the more efficient choice.
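One simple way to act on this observation is to dispatch on the input size and only take the CUDA path for large arrays. The helper and the threshold below are assumptions for illustration; the cut-over point would have to be measured on the actual machine.

// Hypothetical dispatcher: below the (made-up) threshold, sum on the CPU;
// otherwise call the CUDA path from the listing above.
#define SMALL_ARRAY_THRESHOLD (1 << 20)   // placeholder value, tune per machine

long reduce_sum_auto(int *arr, unsigned int total_num)
{
    if (total_num < SMALL_ARRAY_THRESHOLD)
    {
        long sum = 0;
        for (unsigned int i = 0; i < total_num; i++)
            sum += arr[i];
        return sum;
    }
    return API_cuda_reduce_sum(arr, total_num);
}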