• GPGPU OpenCL Reduction操作与group同步


    Reduction操作规约操作就是由多个数生成一个数,如求最大值、最小值、向量点积、求和等操作,都属于这一类操作。

    有大量数据的情况下,使用GPU进行任务并行与数据并行,可以收到很好的效果。

    group同步:OpenCL只提供了工作组内的各线程之间的同步机制,并没有提供所有线程的同步。提供组内work-item同步的方法:

      void barrier (cl_mem_fence_flags flags) 

      参数说明:cl_mem_fence_flags 可以取CLK_LOCAL_MEM_FENCE、CLK_GLOBAL_MEM_FENCE

      函数说明:(1)一个work-group中所有work-item遇到barrier方法,都要等待其他work-item也到达该语句,才能执行后面的程序;

            (2)还可以保证组内的work-item对local或global memory的读写操作的先后顺序(起内存栅栏的作用)。

    如下图中每个大框表示任务并行、每个框是一个group线程;框中的计算是数据并行、每个是一个work-item线程:

    作为练习,给出一个完整的使用OpenCL计算整数序列求和的程序:在数据并行中使用Local Memory加速,group组内并行同步使用CLK_LOCAL_MEM_FENCE。

    程序实例(整数序列求和):

    1.核函数(Own_Reduction_Kernels.cl):

     1 __kernel
     2 void 
     3 reduce(__global uint4* input, __global uint4* output, int NUM)
     4 {    
     5     NUM = NUM / 4;    //每四个数为一个整体uint4。
     6     unsigned int tid = get_local_id(0);
     7     unsigned int localSize = get_local_size(0);
     8     unsigned int globalSize = get_global_size(0);
     9 
    10     uint4 res=(uint4){0,0,0,0};
    11     __local uint4 resArray[64];
    12 
    13     
    14     unsigned int i = get_global_id(0);
    15     while(i < NUM)
    16     {
    17         res+=input[i];
    18         i+=globalSize;
    19     }
    20     resArray[tid]=res;    //将每个work-item计算结果保存到对应__local memory中
    21     barrier(CLK_LOCAL_MEM_FENCE);
    22 
    23     // do reduction in shared mem
    24     for(unsigned int s = localSize >> 1; s > 0; s >>= 1) 
    25     {
    26         if(tid < s) 
    27         {
    28             resArray[tid] += resArray[tid + s];
    29         }
    30         barrier(CLK_LOCAL_MEM_FENCE);
    31     }
    32 
    33     // write result for this block to global mem
    34     if(tid == 0) 
    35         output[get_group_id(0)] = resArray[0];
    36 }

    2.tool.h 、tool.cpp

     见:http://www.cnblogs.com/xudong-bupt/p/3582780.html 

    3.Reduction.cpp

      1 #include <CL/cl.h>
      2 #include "tool.h"
      3 #include <string.h>
      4 #include <stdio.h>
      5 #include <stdlib.h>
      6 #include <iostream>
      7 #include <string>
      8 #include <fstream>
      9 using namespace std;
     10 
     11 int isVerify(int NUM,int groupNUM,int *res)    //校验结果
     12 {
     13        int sum1 = (NUM+1)*NUM/2;
     14     int sum2 = 0;
     15     for(int i = 0;i < groupNUM*4; i++)
     16         sum2 += res[i];
     17     if(sum1 == sum2)
     18         return 0;
     19     return -1;
     20 }
     21 
     22 void isStatusOK(cl_int status)    //判断状态码
     23 {
     24     if(status == CL_SUCCESS)
     25         cout<<"RIGHT"<<endl;
     26     else
     27         cout<<"ERROR"<<endl;
     28 }
     29 
     30 int main(int argc, char* argv[])
     31 {
     32     cl_int    status;
     33     /**Step 1: Getting platforms and choose an available one(first).*/
     34     cl_platform_id platform;
     35     getPlatform(platform);
     36 
     37     /**Step 2:Query the platform and choose the first GPU device if has one.*/
     38     cl_device_id *devices=getCl_device_id(platform);
     39 
     40     /**Step 3: Create context.*/
     41     cl_context context = clCreateContext(NULL,1, devices,NULL,NULL,NULL);
     42 
     43     /**Step 4: Creating command queue associate with the context.*/
     44     cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
     45 
     46     /**Step 5: Create program object */
     47     const char *filename = "Own_Reduction_Kernels.cl";
     48     string sourceStr;
     49     status = convertToString(filename, sourceStr);
     50     const char *source = sourceStr.c_str();
     51     size_t sourceSize[] = {strlen(source)};
     52     cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
     53 
     54     /**Step 6: Build program. */
     55     status=clBuildProgram(program, 1,devices,NULL,NULL,NULL);
     56 
     57     /**Step 7: Initial input,output for the host and create memory objects for the kernel*/
     58     int NUM=25600;    //6400*4
     59     size_t global_work_size[1] = {640};  ///
     60     size_t local_work_size[1]={64};    ///256 PE
     61     size_t groupNUM=global_work_size[0]/local_work_size[0];
     62     int* input = new int[NUM];
     63     for(int i=0;i<NUM;i++)
     64         input[i]=i+1;
     65     int* output = new int[(global_work_size[0]/local_work_size[0])*4];
     66 
     67     cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, (NUM) * sizeof(int),(void *) input, NULL);
     68     cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY , groupNUM*4* sizeof(int), NULL, NULL);
     69 
     70     /**Step 8: Create kernel object */
     71     cl_kernel kernel = clCreateKernel(program,"reduce", NULL);
     72 
     73     /**Step 9: Sets Kernel arguments.*/
     74     status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
     75     status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
     76     status = clSetKernelArg(kernel, 2, sizeof(int), &NUM);
     77 
     78     /**Step 10: Running the kernel.*/
     79     cl_event enentPoint;
     80     status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &enentPoint);
     81     clWaitForEvents(1,&enentPoint); ///wait
     82     clReleaseEvent(enentPoint);
     83     isStatusOK(status);
     84             
     85     /**Step 11: Read the cout put back to host memory.*/
     86     status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0,groupNUM*4 * sizeof(int), output, 0, NULL, NULL);
     87     isStatusOK(status);
     88     if(isVerify(NUM, groupNUM ,output) == 0)
     89         cout<<"The result is right!!!"<<endl;
     90     else
     91         cout<<"The result is wrong!!!"<<endl;
     92 
     93     /**Step 12: Clean the resources.*/
     94     status = clReleaseKernel(kernel);//*Release kernel.
     95     status = clReleaseProgram(program);    //Release the program object.
     96     status = clReleaseMemObject(inputBuffer);//Release mem object.
     97     status = clReleaseMemObject(outputBuffer);
     98     status = clReleaseCommandQueue(commandQueue);//Release  Command queue.
     99     status = clReleaseContext(context);//Release context.
    100 
    101     free(input);
    102     free(output);
    103     free(devices);
    104     return 0;
    105 }
    View Code
  • 相关阅读:
    Shell编程(一)为什么使用Shell编程
    ALSA驱动Debian声卡
    Shell编程(五)find与grep命令简介及正则表达式
    Shell编程(三)控制结构及函数
    初识Linux程序
    Gentoo的哲学
    学习Emacs
    Shell编程(二)Shell基本语法
    第一杯咖啡在Debian 上安装Java环境
    Fvwm 笔记
  • 原文地址:https://www.cnblogs.com/xudong-bupt/p/3586518.html
Copyright © 2020-2023  润新知