• CUDA实例练习(八):原子操作(直方图)


          直方图概念:给定一个包含一组元素的数据集,直方图表示每个元素的出现频率。

    一、在CPU上计算直方图

     1 #include "book.h"
     2 #include <stdio.h>
     3 #include <cuda_runtime.h>
     4 #include <device_launch_parameters.h>
     5 #include <time.h>
     6 
     7 #define SIZE    (100*1024*1024)
     8 
     /*
      * CPU reference implementation of the histogram.
      *
      * Counts how often each byte value occurs in the SIZE-byte buffer returned
      * by big_random_block() (declared in book.h; presumably random data --
      * confirm there), prints the elapsed CPU time in milliseconds and the sum
      * of all bins, then frees the buffer.  Always returns 0.
      */
     int main(void) {
         unsigned char *buffer =
             (unsigned char*)big_random_block(SIZE);

         // capture the start time
         clock_t start = clock();

         // one bin per possible byte value, all starting at zero
         unsigned int histo[256] = { 0 };

         // each input byte selects the bin it increments
         for (int i = 0; i < SIZE; i++)
             histo[buffer[i]]++;

         clock_t stop = clock();
         float elapsedTime = (float)(stop - start) /
             (float)CLOCKS_PER_SEC * 1000.0f;
         printf("Time to generate:  %3.1f ms\n", elapsedTime);

         // every byte was counted exactly once, so the bins should sum to SIZE
         long histoCount = 0;
         for (int i = 0; i < 256; i++) {
             histoCount += histo[i];
         }
         printf("Histogram Sum:  %ld\n", histoCount);

         free(buffer);
         return 0;
     }

    二、在GPU上使用全局内存原子操作计算直方图

     1 #include <stdio.h>
     2 #include <cuda_runtime.h>
     3 #include <device_launch_parameters.h>
     4 #include "book.h"
     5 #include "gpu_anim.h"
     6 #define SIZE (100*1024*1024)
     7 
     /*
      * Byte histogram using atomic adds directly on global memory.
      *
      * buffer : device pointer to the input bytes
      * size   : number of bytes in buffer (values <= 0 are treated as empty)
      * histo  : device pointer to 256 counters, one per byte value; the kernel
      *          only adds to them, so the caller must zero them beforehand
      *
      * Launch: any 1D grid / 1D block.  Each thread starts at its global index
      * and advances by the total number of launched threads until the input is
      * exhausted.
      */
     __global__ void histo_kernel(unsigned char *buffer, long size, unsigned int *histo){
         // Use size_t indices: with `int`, blockIdx.x * blockDim.x or the
         // `i += stride` step can overflow when size approaches INT_MAX.
         const size_t total = size > 0 ? (size_t)size : 0;
         const size_t stride = (size_t)blockDim.x * gridDim.x;
         for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride){
             // many threads may hit the same bin at once, so increment atomically
             atomicAdd(&histo[buffer[i]], 1u);
         }
     }
    /*
     * Host driver for the global-memory atomic histogram.
     *
     * Copies SIZE bytes obtained from big_random_block() (declared in book.h)
     * to the device, launches histo_kernel, copies the 256 bins back, prints
     * the elapsed time and the bin sum, and then checks the result against a
     * host-side recount.  The timed region spans the device allocations, both
     * copies and the kernel.  Always returns 0; mismatching bins are reported
     * on stdout.
     */
    int main(void){
        unsigned char *buffer = (unsigned char*)big_random_block(SIZE);

        // create the timing events and record the start of the measured region
        cudaEvent_t start, stop;
        HANDLE_ERROR(cudaEventCreate(&start));
        HANDLE_ERROR(cudaEventCreate(&stop));
        HANDLE_ERROR(cudaEventRecord(start, 0));

        // allocate device memory for the input data and the histogram bins
        unsigned char *dev_buffer;
        unsigned int *dev_histo;
        HANDLE_ERROR(cudaMalloc((void**)&dev_buffer, SIZE));
        HANDLE_ERROR(cudaMemcpy(dev_buffer, buffer, SIZE, cudaMemcpyHostToDevice));
        HANDLE_ERROR(cudaMalloc((void**)&dev_histo, 256 * sizeof(unsigned int)));
        // the kernel only adds to the bins, so they have to start at zero
        HANDLE_ERROR(cudaMemset(dev_histo, 0, 256 * sizeof(unsigned int)));

        // launch two blocks per multiprocessor of device 0, 256 threads each
        cudaDeviceProp prop;
        HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
        int blocks = prop.multiProcessorCount;
        histo_kernel<<<blocks * 2, 256>>>(dev_buffer, SIZE, dev_histo);
        // a kernel launch returns no status; pick up launch errors explicitly
        HANDLE_ERROR(cudaGetLastError());

        unsigned int histo[256];
        HANDLE_ERROR(cudaMemcpy(histo, dev_histo, 256 * sizeof(unsigned int), cudaMemcpyDeviceToHost));

        // record the stop event and report the elapsed time
        HANDLE_ERROR(cudaEventRecord(stop, 0));
        HANDLE_ERROR(cudaEventSynchronize(stop));
        float elapsedTime;
        HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
        printf("Time to generate: %3.1f ms\n", elapsedTime);

        long histoCount = 0;
        for (int i = 0; i < 256; i++){
            histoCount += histo[i];
        }
        // histoCount is a long, so the conversion must be %ld
        printf("Histogram Sum: %ld\n", histoCount);

        // verify against the CPU: remove one count per input byte from the GPU
        // result; every bin must end up at exactly zero
        for (int i = 0; i < SIZE; i++)
            histo[buffer[i]]--;
        for (int i = 0; i < 256; i++){
            if (histo[i] != 0)
                printf("Failure at %d!\n", i);
        }

        // release the CUDA events, the device memory and the host buffer
        HANDLE_ERROR(cudaEventDestroy(start));
        HANDLE_ERROR(cudaEventDestroy(stop));
        HANDLE_ERROR(cudaFree(dev_histo));
        HANDLE_ERROR(cudaFree(dev_buffer));
        free(buffer);
        return 0;
    }

    在GPU上运行时间比在CPU上运行时间长,性能不理想。原因是成千上万个线程同时对全局内存中仅有的256个地址执行原子操作,落在同一地址上的操作会被串行化,竞争非常激烈。

    三、在GPU上使用共享内存原子操作计算直方图

     1 #include <stdio.h>
     2 #include <cuda_runtime.h>
     3 #include <device_launch_parameters.h>
     4 #include "book.h"
     5 #include "gpu_anim.h"
     6 #define SIZE (100*1024*1024)
     7 
     /*
      * Byte histogram using a per-block partial histogram in shared memory.
      *
      * buffer : device pointer to the input bytes
      * size   : number of bytes in buffer (values <= 0 are treated as empty)
      * histo  : device pointer to 256 counters, one per byte value; the kernel
      *          only adds to them, so the caller must zero them beforehand
      *
      * Launch: any 1D grid / 1D block.  The bin loops below stride by
      * blockDim.x, so the block size no longer has to equal the bin count.
      * Uses 256 * sizeof(unsigned int) bytes of static shared memory.
      */
     __global__ void histo_kernel(unsigned char *buffer, long size, unsigned int *histo){
         const int NUM_BINS = 256;
         __shared__ unsigned int temp[NUM_BINS];

         // Clear every bin.  Indexing with threadIdx.x alone would leave bins
         // uninitialised for blocks smaller than 256 threads and write out of
         // bounds for larger ones.
         for (int bin = threadIdx.x; bin < NUM_BINS; bin += blockDim.x)
             temp[bin] = 0;
         __syncthreads();

         // Use size_t indices: with `int`, blockIdx.x * blockDim.x or the
         // `i += stride` step can overflow when size approaches INT_MAX.
         const size_t total = size > 0 ? (size_t)size : 0;
         const size_t stride = (size_t)blockDim.x * gridDim.x;
         for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride){
             // contention is limited to the threads of this block
             atomicAdd(&temp[buffer[i]], 1u);
         }

         // all threads of the block must finish counting before the merge
         __syncthreads();

         // fold this block's partial histogram into the global result
         for (int bin = threadIdx.x; bin < NUM_BINS; bin += blockDim.x)
             atomicAdd(&histo[bin], temp[bin]);
     }
    /*
     * Host driver for the shared-memory atomic histogram.
     *
     * Uploads SIZE bytes obtained from big_random_block() (declared in
     * book.h), runs histo_kernel, downloads the 256 bins, prints the elapsed
     * time and the bin sum, and finally validates the bins with a host-side
     * recount.  The CUDA events bracket the device allocations, both copies
     * and the kernel.  Always returns 0; mismatching bins are reported on
     * stdout.
     */
    int main(void){
        unsigned char *buffer = (unsigned char*)big_random_block(SIZE);

        // set up the timing events and mark the start of the measured region
        cudaEvent_t start, stop;
        HANDLE_ERROR(cudaEventCreate(&start));
        HANDLE_ERROR(cudaEventCreate(&stop));
        HANDLE_ERROR(cudaEventRecord(start, 0));

        // allocate device memory for the input data and the histogram bins
        unsigned char *dev_buffer;
        unsigned int *dev_histo;
        HANDLE_ERROR(cudaMalloc((void**)&dev_buffer, SIZE));
        HANDLE_ERROR(cudaMemcpy(dev_buffer, buffer, SIZE, cudaMemcpyHostToDevice));
        HANDLE_ERROR(cudaMalloc((void**)&dev_histo, 256 * sizeof(unsigned int)));
        // the kernel accumulates into the bins, so clear them first
        HANDLE_ERROR(cudaMemset(dev_histo, 0, 256 * sizeof(unsigned int)));

        // two blocks per multiprocessor of device 0; 256 threads per block,
        // i.e. one thread per histogram bin
        cudaDeviceProp prop;
        HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
        int blocks = prop.multiProcessorCount;
        histo_kernel<<<blocks * 2, 256>>>(dev_buffer, SIZE, dev_histo);
        // launches report no status themselves; query launch errors explicitly
        HANDLE_ERROR(cudaGetLastError());

        unsigned int histo[256];
        HANDLE_ERROR(cudaMemcpy(histo, dev_histo, 256 * sizeof(unsigned int), cudaMemcpyDeviceToHost));

        // take the stop time and display the timing result
        HANDLE_ERROR(cudaEventRecord(stop, 0));
        HANDLE_ERROR(cudaEventSynchronize(stop));
        float elapsedTime;
        HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
        printf("Time to generate: %3.1f ms\n", elapsedTime);

        long histoCount = 0;
        for (int i = 0; i < 256; i++){
            histoCount += histo[i];
        }
        // histoCount is a long, so the conversion must be %ld
        printf("Histogram Sum: %ld\n", histoCount);

        // verify against the CPU: decrement the GPU bins once per input byte;
        // any bin that does not return to zero was counted incorrectly
        for (int i = 0; i < SIZE; i++)
            histo[buffer[i]]--;
        for (int i = 0; i < 256; i++){
            if (histo[i] != 0)
                printf("Failure at %d!\n", i);
        }

        // release the CUDA events, the device memory and the host buffer
        HANDLE_ERROR(cudaEventDestroy(start));
        HANDLE_ERROR(cudaEventDestroy(stop));
        HANDLE_ERROR(cudaFree(dev_histo));
        HANDLE_ERROR(cudaFree(dev_buffer));
        free(buffer);
        return 0;
    }

    运行时间缩短很多,性能提升明显。原因是每个线程块先在自己的共享内存中累加出一个局部直方图,竞争只发生在同一线程块内的线程之间;最后每个线程块只需对全局直方图执行256次原子加法。

  • 相关阅读:
    MSDN for 2010的那些麻烦事
    CPtrList操作--插入,删除特定元素,删除全部
    如何绕过ORA00701错误和降低bootstrap对象的高水位
    ORA00600:[1112]内部错误&ROW CACHE ENQUEUE LOCK一例
    CRS5008: Invalid attribute value
    ORA00600[kjpsod1]&ORA44203错误一例
    runInstaller ignoreInternalDriverError
    RMAN CURSOR_SHARING=EXACT脚本
    SQL调优:带函数的谓词导致CBO Cardinality计算误差
    11g Real Application Testing:Database Replay使用方法
  • 原文地址:https://www.cnblogs.com/zhangshuwen/p/7346050.html
Copyright © 2020-2023  润新知