CUDA Programming Notes

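    The listings below are taken from the book (see the note at the end) and rely on the HANDLE_ERROR macro from the common/book.h header that ships with its sample code. A minimal sketch of that macro, assuming the book's conventions:

    #include <stdio.h>
    #include <stdlib.h>

    // print the CUDA error string and bail out on any failure
    static void HandleError( cudaError_t err,
                             const char *file, int line ) {
        if (err != cudaSuccess) {
            printf( "%s in %s at line %d\n",
                    cudaGetErrorString( err ), file, line );
            exit( EXIT_FAILURE );
        }
    }
    #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

    The first listing computes a dot product: each block reduces its threads' partial products in shared memory, and the CPU sums the per-block results.
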
    #define imin(a,b) (a<b?a:b)

    const int N = 33 * 1024;
    const int threadsPerBlock = 256;
    const int blocksPerGrid =
                imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
    
    
    __global__ void dot( float *a, float *b, float *c ) {
        __shared__ float cache[threadsPerBlock];
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        int cacheIndex = threadIdx.x;
    
        float   temp = 0;
        while (tid < N) {
            temp += a[tid] * b[tid];
            tid += blockDim.x * gridDim.x;
        }
        
        // set the cache values
        cache[cacheIndex] = temp;
        
        // synchronize threads in this block
        __syncthreads();
    
        // for reductions, threadsPerBlock must be a power of 2
        // because of the following code
        int i = blockDim.x/2;
        while (i != 0) {
            if (cacheIndex < i)
                cache[cacheIndex] += cache[cacheIndex + i];
            __syncthreads();
            i /= 2;
        }
    
        if (cacheIndex == 0)
            c[blockIdx.x] = cache[0];
    }
    
    
    int main( void ) {
        float   *a, *b, c, *partial_c;
        float   *dev_a, *dev_b, *dev_partial_c;
    
        // allocate memory on the cpu side
        a = (float*)malloc( N*sizeof(float) );
        b = (float*)malloc( N*sizeof(float) );
        partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
    
        // allocate the memory on the GPU
        HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                                  N*sizeof(float) ) );
        HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                                  N*sizeof(float) ) );
        HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                                  blocksPerGrid*sizeof(float) ) );
    
        // fill in the host memory with data
        for (int i=0; i<N; i++) {
            a[i] = i;
            b[i] = i*2;
        }
    
        // copy the arrays 'a' and 'b' to the GPU
        HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
                                  cudaMemcpyHostToDevice ) );
        HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
                                  cudaMemcpyHostToDevice ) ); 
    
        dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
                                                dev_partial_c );
    
        // copy the array 'c' back from the GPU to the CPU
        HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                                  blocksPerGrid*sizeof(float),
                                  cudaMemcpyDeviceToHost ) );
    
        // finish up on the CPU side
        c = 0;
        for (int i=0; i<blocksPerGrid; i++) {
            c += partial_c[i];
        }
    
        #define sum_squares(x)  (x*(x+1)*(2*x+1)/6)
        printf( "Does GPU value %.6g = %.6g?
    ", c,
                 2 * sum_squares( (float)(N - 1) ) );
    
        // free memory on the gpu side
        HANDLE_ERROR( cudaFree( dev_a ) );
        HANDLE_ERROR( cudaFree( dev_b ) );
        HANDLE_ERROR( cudaFree( dev_partial_c ) );
    
        // free memory on the cpu side
        free( a );
        free( b );
        free( partial_c );
    }
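
    The check at the end works because the host fills a[i] = i and b[i] = 2*i, so the dot product has a closed form:

        sum_{i=0}^{N-1} i*(2i) = 2 * sum_{i=0}^{N-1} i^2 = 2 * (N-1)*N*(2N-1)/6

    which is exactly what 2 * sum_squares( (float)(N - 1) ) computes.

    The next listing moves the final accumulation onto the GPU as well: a spinlock built on atomicCAS serializes the per-block updates of a single float result.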


    struct Lock {
        int *mutex;
        Lock( void ) {
            HANDLE_ERROR( cudaMalloc( (void**)&mutex,sizeof(int) ) );
            HANDLE_ERROR( cudaMemset( mutex, 0, sizeof(int) ) );
        }
        ~Lock( void ) {
            cudaFree( mutex );
        }
        __device__ void lock( void ) {
            while( atomicCAS( mutex, 0, 1 ) != 0 );
        }
        __device__ void unlock( void ) {
            atomicExch( mutex, 0 );
        }
    };

    #define imin(a,b) (a<b?a:b)
    
    const int N = 33 * 1024 * 1024;
    const int threadsPerBlock = 256;
    const int blocksPerGrid =
                imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
    
    __global__ void dot( Lock lock, float *a,
                         float *b, float *c ) {
        __shared__ float cache[threadsPerBlock];
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        int cacheIndex = threadIdx.x;
    
        float   temp = 0;
        while (tid < N) {
            temp += a[tid] * b[tid];
            tid += blockDim.x * gridDim.x;
        }
        
        // set the cache values
        cache[cacheIndex] = temp;
        
        // synchronize threads in this block
        __syncthreads();
    
        // for reductions, threadsPerBlock must be a power of 2
        // because of the following code
        int i = blockDim.x/2;
        while (i != 0) {
            if (cacheIndex < i)
                cache[cacheIndex] += cache[cacheIndex + i];
            __syncthreads();
            i /= 2;
        }
    
        if (cacheIndex == 0) {
            // wait until we get the lock
            lock.lock();
            // we have the lock at this point, update and release
            *c += cache[0];
            lock.unlock();
        }
    }
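
    Only thread 0 of each block ever contends for the lock, so at most blocksPerGrid threads spin on the mutex; letting every thread take the lock would serialize the whole grid. On devices with hardware float atomics (compute capability 2.0 and later), the lock can be dropped entirely. A minimal sketch of the kernel's tail under that assumption:

        // alternative ending for dot(), assuming compute capability
        // >= 2.0 so that atomicAdd() on a global float is available
        if (cacheIndex == 0)
            atomicAdd( c, cache[0] );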
    
    
    int main( void ) {
        float   *a, *b, c = 0;
        float   *dev_a, *dev_b, *dev_c;
    
        // allocate memory on the cpu side
        a = (float*)malloc( N*sizeof(float) );
        b = (float*)malloc( N*sizeof(float) );
    
        // allocate the memory on the GPU
        HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                                  N*sizeof(float) ) );
        HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                                  N*sizeof(float) ) );
        HANDLE_ERROR( cudaMalloc( (void**)&dev_c,
                                  sizeof(float) ) );
    
        // fill in the host memory with data
        for (int i=0; i<N; i++) {
            a[i] = i;
            b[i] = i*2;
        }
    
        // copy the arrays 'a' and 'b' to the GPU
        HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
                                  cudaMemcpyHostToDevice ) );
        HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
                                  cudaMemcpyHostToDevice ) ); 
        HANDLE_ERROR( cudaMemcpy( dev_c, &c, sizeof(float),
                                  cudaMemcpyHostToDevice ) ); 
    
        Lock    lock;
        dot<<<blocksPerGrid,threadsPerBlock>>>( lock, dev_a,
                                                dev_b, dev_c );
    
        // copy c back from the GPU to the CPU
        HANDLE_ERROR( cudaMemcpy( &c, dev_c,
                                  sizeof(float),
                                  cudaMemcpyDeviceToHost ) );
    
        #define sum_squares(x)  (x*(x+1)*(2*x+1)/6)
        printf( "Does GPU value %.6g = %.6g?
    ", c,
                 2 * sum_squares( (float)(N - 1) ) );
    
        // free memory on the gpu side
        HANDLE_ERROR( cudaFree( dev_a ) );
        HANDLE_ERROR( cudaFree( dev_b ) );
        HANDLE_ERROR( cudaFree( dev_c ) );
    
        // free memory on the cpu side
        free( a );
        free( b );
    }
    

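    The two histogram listings below rely on SIZE and big_random_block() from the book's support code. A minimal sketch of both, assuming the book's 100 MB default input:

    #include <stdlib.h>

    #define SIZE (100*1024*1024)

    // fill a buffer with pseudo-random bytes; a stand-in for the
    // book's big_random_block() helper
    unsigned char* big_random_block( long size ) {
        unsigned char *data = (unsigned char*)malloc( size );
        for (long i=0; i<size; i++)
            data[i] = rand() & 0xff;
        return data;
    }

    The first version bins entirely in global memory: every input byte costs one atomicAdd() on the shared 256-entry histogram. The second version (further below) accumulates into a per-block shared-memory histogram first and merges into global memory once per bin per block, which greatly reduces contention.
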
    __global__ void histo_kernel( unsigned char *buffer,
                                  long size,
                                  unsigned int *histo ) {
        // calculate the starting index and the offset to the next
        // block that each thread will be processing
        int i = threadIdx.x + blockIdx.x * blockDim.x;
        int stride = blockDim.x * gridDim.x;
        while (i < size) {
            atomicAdd( &histo[buffer[i]], 1 );
            i += stride;
        }
    }
    
    int main( void ) {
        unsigned char *buffer =
                         (unsigned char*)big_random_block( SIZE );
    
        // capture the start time
        // starting the timer here so that we include the cost of
        // all of the operations on the GPU.
        cudaEvent_t     start, stop;
        HANDLE_ERROR( cudaEventCreate( &start ) );
        HANDLE_ERROR( cudaEventCreate( &stop ) );
        HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    
        // allocate memory on the GPU for the file's data
        unsigned char *dev_buffer;
        unsigned int *dev_histo;
        HANDLE_ERROR( cudaMalloc( (void**)&dev_buffer, SIZE ) );
        HANDLE_ERROR( cudaMemcpy( dev_buffer, buffer, SIZE,
                                  cudaMemcpyHostToDevice ) );
    
        HANDLE_ERROR( cudaMalloc( (void**)&dev_histo,
                                  256 * sizeof( int ) ) );
        HANDLE_ERROR( cudaMemset( dev_histo, 0,
                                  256 * sizeof( int ) ) );
    
        // kernel launch - 2x the number of mps gave best timing
        cudaDeviceProp  prop;
        HANDLE_ERROR( cudaGetDeviceProperties( &prop, 0 ) );
        int blocks = prop.multiProcessorCount;
        histo_kernel<<<blocks*2,256>>>( dev_buffer, SIZE, dev_histo );
        
        unsigned int    histo[256];
        HANDLE_ERROR( cudaMemcpy( histo, dev_histo,
                                  256 * sizeof( int ),
                                  cudaMemcpyDeviceToHost ) );
    
        // get stop time, and display the timing results
        HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
        HANDLE_ERROR( cudaEventSynchronize( stop ) );
        float   elapsedTime;
        HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                            start, stop ) );
        printf( "Time to generate:  %3.1f ms
    ", elapsedTime );
    
        long histoCount = 0;
        for (int i=0; i<256; i++) {
            histoCount += histo[i];
        }
        printf( "Histogram Sum:  %ld
    ", histoCount );
    
        // verify that we have the same counts via CPU
        for (int i=0; i<SIZE; i++)
            histo[buffer[i]]--;
        for (int i=0; i<256; i++) {
            if (histo[i] != 0)
                printf( "Failure at %d!  Off by %d
    ", i, histo[i] );
        }
    
        HANDLE_ERROR( cudaEventDestroy( start ) );
        HANDLE_ERROR( cudaEventDestroy( stop ) );
        cudaFree( dev_histo );
        cudaFree( dev_buffer );
        free( buffer );
        return 0;
    }
    


    __global__ void histo_kernel( unsigned char *buffer,
                                  long size,
                                  unsigned int *histo ) {
    
        // clear out the accumulation buffer called temp
        // since we are launched with 256 threads, it is easy
        // to clear that memory with one write per thread
        __shared__  unsigned int temp[256];
        temp[threadIdx.x] = 0;
        __syncthreads();
    
        // calculate the starting index and the offset to the next
        // block that each thread will be processing
        int i = threadIdx.x + blockIdx.x * blockDim.x;
        int stride = blockDim.x * gridDim.x;
        while (i < size) {
            atomicAdd( &temp[buffer[i]], 1 );
            i += stride;
        }
        // sync the data from the above writes to shared memory
        // then add the shared memory values to the values from
        // the other thread blocks using global memory
        // atomic adds
        // same as before, since we have 256 threads, updating the
        // global histogram is just one write per thread!
        __syncthreads();
        atomicAdd( &(histo[threadIdx.x]), temp[threadIdx.x] );
    }
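
    Both the clearing of temp[] and the final flush assume the launch uses exactly 256 threads per block, matching the 256 bins, so each step is one write per thread. With any other block size both steps would need a loop over the bins; a minimal sketch of the flush under that assumption:

        // flush shared bins to the global histogram when blockDim.x
        // may differ from the bin count (a sketch, not the book's code)
        __syncthreads();
        for (int bin = threadIdx.x; bin < 256; bin += blockDim.x)
            atomicAdd( &histo[bin], temp[bin] );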
    
    int main( void ) {
        unsigned char *buffer =
                         (unsigned char*)big_random_block( SIZE );
    
        // capture the start time
        // starting the timer here so that we include the cost of
        // all of the operations on the GPU.  if the data were
        // already on the GPU and we just timed the kernel
        // the timing would drop from 74 ms to 15 ms.  Very fast.
        cudaEvent_t     start, stop;
        HANDLE_ERROR( cudaEventCreate( &start ) );
        HANDLE_ERROR( cudaEventCreate( &stop ) );
        HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    
        // allocate memory on the GPU for the file's data
        unsigned char *dev_buffer;
        unsigned int *dev_histo;
        HANDLE_ERROR( cudaMalloc( (void**)&dev_buffer, SIZE ) );
        HANDLE_ERROR( cudaMemcpy( dev_buffer, buffer, SIZE,
                                  cudaMemcpyHostToDevice ) );
    
        HANDLE_ERROR( cudaMalloc( (void**)&dev_histo,
                                  256 * sizeof( int ) ) );
        HANDLE_ERROR( cudaMemset( dev_histo, 0,
                                  256 * sizeof( int ) ) );
    
        // kernel launch - 2x the number of mps gave best timing
        cudaDeviceProp  prop;
        HANDLE_ERROR( cudaGetDeviceProperties( &prop, 0 ) );
        int blocks = prop.multiProcessorCount;
        histo_kernel<<<blocks*2,256>>>( dev_buffer,
                                        SIZE, dev_histo );
        
        unsigned int    histo[256];
        HANDLE_ERROR( cudaMemcpy( histo, dev_histo,
                                  256 * sizeof( int ),
                                  cudaMemcpyDeviceToHost ) );
    
        // get stop time, and display the timing results
        HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
        HANDLE_ERROR( cudaEventSynchronize( stop ) );
        float   elapsedTime;
        HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                            start, stop ) );
        printf( "Time to generate:  %3.1f ms
    ", elapsedTime );
    
        long histoCount = 0;
        for (int i=0; i<256; i++) {
            histoCount += histo[i];
        }
        printf( "Histogram Sum:  %ld
    ", histoCount );
    
        // verify that we have the same counts via CPU
        for (int i=0; i<SIZE; i++)
            histo[buffer[i]]--;
        for (int i=0; i<256; i++) {
            if (histo[i] != 0)
                printf( "Failure at %d!
    ", i );
        }
    
        HANDLE_ERROR( cudaEventDestroy( start ) );
        HANDLE_ERROR( cudaEventDestroy( stop ) );
        cudaFree( dev_histo );
        cudaFree( dev_buffer );
        free( buffer );
        return 0;
    }
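
    As the comment in main() notes, the measured time includes allocation and the host-to-device copy (about 74 ms); timing only the kernel would show roughly 15 ms. A minimal sketch of bracketing just the launch, assuming dev_buffer and dev_histo are already populated:

        // time only the kernel, excluding allocation and copies
        HANDLE_ERROR( cudaEventRecord( start, 0 ) );
        histo_kernel<<<blocks*2,256>>>( dev_buffer, SIZE, dev_histo );
        HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
        HANDLE_ERROR( cudaEventSynchronize( stop ) );
        HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) );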
    


    Note: this article is the author's study notes on CUDA by Example (《GPU高性能编程CUDA实战》). The book's sample code can be downloaded from the link below (no CSDN points required):

    http://download.csdn.net/detail/celerychen2009/6360573


  • Original article: https://www.cnblogs.com/celerychen/p/3588191.html