• opencl(二十五)----双调排序


    参考:《opencl实战》 

    双调排序

    一个序列:进行升序排列

    6   1   4   5     7    2    3    8

    a、左右两部分别 升序、降序

    1  4  5   6      8  7  3  2

    b 、左右度应位置比较,小的左移

    1 4  3   2       8  7   5  6

    c、左右都整成升序

    1 2 3 4       5  6   7   8

    注:四个元素如何排序

     opencl 中可用代码如下:

       uint4 mask1 = (uint4)(1, 0, 3, 2);
       uint4 mask2 = (uint4)(2, 3, 0, 1);
       uint4 mask3 = (uint4)(3, 2, 1, 0);

       int4 add1 = (int4)(1, 1, 3, 3);
       int4 add2 = (int4)(2, 3, 2, 3);
       int4 add3 = (int4)(1, 2, 2, 3);
       int4 add4 = (int4)(4, 5, 6, 7);

    // float4* : input input1 input2 // dir : 0升 -1降 #define UP 0
    #define DOWN -1

    /* Sort elements in a vector */
    #define SORT_VECTOR(input, dir)                                  
       comp = input < shuffle(input, mask1) ^ dir;                   
       input = shuffle(input, as_uint4(comp + add1));                
       comp = input < shuffle(input, mask2) ^ dir;                   
       input = shuffle(input, as_uint4(comp * 2 + add2));            
       comp = input < shuffle(input, mask3) ^ dir;                   
       input = shuffle(input, as_uint4(comp + add3));                


    /* Sort elements between two vectors */
    #define SWAP_VECTORS(input1, input2, dir)                        
       temp = input1;                                                
       comp = (input1 < input2 ^ dir) * 4 + add4;                    
       input1 = shuffle2(input1, input2, as_uint4(comp));            
       input2 = shuffle2(input2, temp, as_uint4(comp));              

    demo:  对8个数值排序

      1 /****************************  kernel *******************************/
      2 #define UP 0
      3 #define DOWN -1
      4 
      5 /* Sort elements in a vector */
      6 #define SORT_VECTOR(input, dir)                                   
      7    comp = input < shuffle(input, mask1) ^ dir;                    
      8    input = shuffle(input, as_uint4(comp + add1));                 
      9    comp = input < shuffle(input, mask2) ^ dir;                    
     10    input = shuffle(input, as_uint4(comp * 2 + add2));             
     11    comp = input < shuffle(input, mask3) ^ dir;                    
     12    input = shuffle(input, as_uint4(comp + add3));                 
     13 
     14 /* Sort elements between two vectors */
     15 #define SWAP_VECTORS(input1, input2, dir)                         
     16    temp = input1;                                                 
     17    comp = (input1 < input2 ^ dir) * 4 + add4;                     
     18    input1 = shuffle2(input1, input2, as_uint4(comp));             
     19    input2 = shuffle2(input2, temp, as_uint4(comp));               
     20 
     21 __kernel void bsort8(__global float4 *data, int dir) {
     22 
     23    float4 input1, input2, temp;
     24    int4 comp;
     25 
     26    uint4 mask1 = (uint4)(1, 0, 3, 2);
     27    uint4 mask2 = (uint4)(2, 3, 0, 1);
     28    uint4 mask3 = (uint4)(3, 2, 1, 0);
     29 
     30    int4 add1 = (int4)(1, 1, 3, 3);
     31    int4 add2 = (int4)(2, 3, 2, 3);
     32    int4 add3 = (int4)(1, 2, 2, 3);
     33    int4 add4 = (int4)(4, 5, 6, 7);
     34 
     35    input1 = data[0];
     36    input2 = data[1];
     37 
     38    SORT_VECTOR(input1, UP)
     39    SORT_VECTOR(input2, DOWN)
     40 
     41    SWAP_VECTORS(input1, input2, dir)
     42 
     43    SORT_VECTOR(input1, dir)
     44    SORT_VECTOR(input2, dir)
     45 
     46    data[0] = input1;
     47    data[1] = input2;
     48 }
     49 
     50 
     51 /**************************************  主机程序  ***************************************/
     52 #define _CRT_SECURE_NO_WARNINGS
     53 #define PROGRAM_FILE "bsort8.cl"
     54 #define KERNEL_FUNC "bsort8"
     55 
     56 #define ASCENDING 0
     57 #define DESCENDING -1
     58 
     59 #include <stdio.h>
     60 #include <stdlib.h>
     61 #include <string.h>
     62 #include <time.h>
     63 
     64 #ifdef MAC
     65 #include <OpenCL/cl.h>
     66 #else
     67 #include <CL/cl.h>
     68 #endif
     69 
     70 /* Find a GPU or CPU associated with the first available platform */
     71 cl_device_id create_device() {
     72 
     73    cl_platform_id platform;
     74    cl_device_id dev;
     75    int err;
     76 
     77    /* Identify a platform */
     78    err = clGetPlatformIDs(1, &platform, NULL);
     79    if(err < 0) {
     80       perror("Couldn't identify a platform");
     81       exit(1);
     82    } 
     83 
     84    /* Access a device */
     85    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
     86    if(err == CL_DEVICE_NOT_FOUND) {
     87       err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
     88    }
     89    if(err < 0) {
     90       perror("Couldn't access any devices");
     91       exit(1);   
     92    }
     93 
     94    return dev;
     95 }
     96 
     97 /* Create program from a file and compile it */
     98 cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
     99 
    100    cl_program program;
    101    FILE *program_handle;
    102    char *program_buffer, *program_log;
    103    size_t program_size, log_size;
    104    int err;
    105 
    106    /* Read program file and place content into buffer */
    107    program_handle = fopen(filename, "r");
    108    if(program_handle == NULL) {
    109       perror("Couldn't find the program file");
    110       exit(1);
    111    }
    112    fseek(program_handle, 0, SEEK_END);
    113    program_size = ftell(program_handle);
    114    rewind(program_handle);
    115    program_buffer = (char*)malloc(program_size + 1);
    116    program_buffer[program_size] = '';
    117    fread(program_buffer, sizeof(char), program_size, program_handle);
    118    fclose(program_handle);
    119 
    120    /* Create program from file */
    121    program = clCreateProgramWithSource(ctx, 1, 
    122       (const char**)&program_buffer, &program_size, &err);
    123    if(err < 0) {
    124       perror("Couldn't create the program");
    125       exit(1);
    126    }
    127    free(program_buffer);
    128 
    129    /* Build program */
    130    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    131    if(err < 0) {
    132 
    133       /* Find size of log and print to std output */
    134       clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
    135             0, NULL, &log_size);
    136       program_log = (char*) malloc(log_size + 1);
    137       program_log[log_size] = '';
    138       clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
    139             log_size + 1, program_log, NULL);
    140       printf("%s
    ", program_log);
    141       free(program_log);
    142       exit(1);
    143    }
    144 
    145    return program;
    146 }
    147 
    148 int main() {
    149 
    150    /* Host/device data structures */
    151    cl_device_id device;
    152    cl_context context;
    153    cl_command_queue queue;
    154    cl_program program;
    155    cl_kernel kernel;
    156    cl_int i, err, dir, check;
    157 
    158    /* Data and buffers */
    159    float data[8];
    160    cl_mem data_buffer;
    161    
    162    /* Initialize data */
    163    data[0] = 3.0f; data[1] = 5.0f; data[2] = 4.0f; data[3] = 6.0f;
    164    data[4] = 0.0f; data[5] = 7.0f; data[6] = 2.0f; data[7] = 1.0f;
    165    printf("Input:  %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f
    ",
    166        data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
    167 
    168    /* Create a device and context */
    169    device = create_device();
    170    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    171    if(err < 0) {
    172       perror("Couldn't create a context");
    173       exit(1);   
    174    }
    175 
    176    /* Create a kernel */
    177    program = build_program(context, device, PROGRAM_FILE);
    178    kernel = clCreateKernel(program, KERNEL_FUNC, &err);
    179    if(err < 0) {
    180       perror("Couldn't create a kernel");
    181       exit(1);   
    182    };
    183 
    184    /* Create buffer */
    185    data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
    186          CL_MEM_COPY_HOST_PTR, sizeof(data), data, &err);
    187    if(err < 0) {
    188       perror("Couldn't create a buffer");
    189       exit(1);   
    190    };
    191 
    192    /* Create kernel argument */
    193    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer);
    194    if(err < 0) {
    195       printf("Couldn't set a kernel argument");
    196       exit(1);
    197    };
    198 
    199    /* Create kernel argument */
    200    dir = ASCENDING;
    201    err = clSetKernelArg(kernel, 1, sizeof(int), &dir);
    202    if(err < 0) {
    203       printf("Couldn't set a kernel argument");
    204       exit(1);
    205    };
    206 
    207    /* Create a command queue */
    208    queue = clCreateCommandQueue(context, device, 0, &err);
    209    if(err < 0) {
    210       perror("Couldn't create a command queue");
    211       exit(1);   
    212    };
    213 
    214    /* Enqueue kernel */
    215    err = clEnqueueTask(queue, kernel, 0, NULL, NULL); 
    216    if(err < 0) {
    217       perror("Couldn't enqueue the kernel");
    218       exit(1);   
    219    }
    220 
    221    /* Read and print the result */
    222    err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, 
    223       sizeof(data), &data, 0, NULL, NULL);
    224    if(err < 0) {
    225       perror("Couldn't read the buffer");
    226       exit(1);   
    227    }
    228    printf("Output: %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f
    ",
    229        data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
    230 
    231    /* Check the result */
    232    check = 1;
    233 
    234    /* Check ascending sort */
    235    if(dir == ASCENDING) {
    236       for(i=1; i<8; i++) {
    237          if(data[i] < data[i-1]) {
    238             check = 0;
    239             break;
    240          }
    241       }
    242    }
    243    /* Check descending sort */
    244    if(dir == DESCENDING) {
    245       for(i=1; i<8; i++) {
    246          if(data[i] > data[i-1]) {
    247             check = 0;
    248             break;
    249          }
    250       }
    251    }
    252 
    253    /* Display check result */
    254    if(check)
    255       printf("Bitonic sort succeeded.
    ");
    256    else
    257       printf("Bitonic sort failed.
    ");
    258 
    259    /* Deallocate resources */
    260    clReleaseMemObject(data_buffer);
    261    clReleaseKernel(kernel);
    262    clReleaseCommandQueue(queue);
    263    clReleaseProgram(program);
    264    clReleaseContext(context);
    265    return 0;
    266 }
    View Code

    一个通用示例

    主机程序:

    // 获取设备

    // 获取设备
    cl_device_id create_device() {
    
       cl_platform_id platform;
       cl_device_id dev;
       int err;
    
       /* Identify a platform */
       err = clGetPlatformIDs(1, &platform, NULL);
       if(err < 0) {
          perror("Couldn't identify a platform");
          exit(1);
       } 
    
       /* Access a device */
       err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
       if(err == CL_DEVICE_NOT_FOUND) {
          err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
       }
       if(err < 0) {
          perror("Couldn't access any devices");
          exit(1);   
       }
    
       return dev;
    }

    // 创建并编译cl_program

     1 // 创建并编译cl_program
     2 cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
     3 
     4    cl_program program;
     5    FILE *program_handle;
     6    char *program_buffer, *program_log;
     7    size_t program_size, log_size;
     8    int err;
     9 
    10    /* Read program file and place content into buffer */
    11    program_handle = fopen(filename, "r");
    12    if(program_handle == NULL) {
    13       perror("Couldn't find the program file");
    14       exit(1);
    15    }
    16    fseek(program_handle, 0, SEEK_END);
    17    program_size = ftell(program_handle);
    18    rewind(program_handle);
    19    program_buffer = (char*)malloc(program_size + 1);
    20    program_buffer[program_size] = '';
    21    fread(program_buffer, sizeof(char), program_size, program_handle);
    22    fclose(program_handle);
    23 
    24    /* Create program from file */
    25    program = clCreateProgramWithSource(ctx, 1, 
    26       (const char**)&program_buffer, &program_size, &err);
    27    if(err < 0) {
    28       perror("Couldn't create the program");
    29       exit(1);
    30    }
    31    free(program_buffer);
    32 
    33    /* Build program */
    34    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    35    if(err < 0) {
    36 
    37       /* Find size of log and print to std output */
    38       clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
    39             0, NULL, &log_size);
    40       program_log = (char*) malloc(log_size + 1);
    41       program_log[log_size] = '';
    42       clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
    43             log_size + 1, program_log, NULL);
    44       printf("%s
    ", program_log);
    45       free(program_log);
    46       exit(1);
    47    }
    48 
    49    return program;
    50 }
    View Code

    // main

    #禁止不安全的错误警告
    #define _CRT_SECURE_NO_WARNINGS  
    #define PROGRAM_FILE       "bsort.cl"
    #define BSORT_INIT         "bsort_init"
    #define BSORT_STAGE_0      "bsort_stage_0"
    #define BSORT_STAGE_N      "bsort_stage_n"
    #define BSORT_MERGE        "bsort_merge"
    #define BSORT_MERGE_LAST   "bsort_merge_last"
    
    /* Ascending: 0, Descending: -1 */
    #define DIRECTION 0
    #define NUM_FLOATS 1048576
    
    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>       
    
    #ifdef MAC
    #include <OpenCL/cl.h>
    #else
    #include <CL/cl.h>
    #endif
    
    /* Find a GPU or CPU associated with the first available platform */
    cl_device_id create_device() ;
    
    /* Create program from a file and compile it */
    cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) ;
    int main() {
    
       /* Host/device data structures */
       cl_device_id device;
       cl_context context;
       cl_command_queue queue;
       cl_program program;
       cl_kernel kernel_init, kernel_stage_0, kernel_stage_n, kernel_merge,
             kernel_merge_last;
       cl_int i, err, check, direction;
    
       /* Data and buffers */
       float data[NUM_FLOATS];
       cl_mem data_buffer;
       cl_uint stage, high_stage, num_stages;
       size_t local_size, global_size;
    
       /* Initialize data */
       srand(time(NULL));
       for(i=0; i<NUM_FLOATS; i++) {
          data[i] = rand();
       }
    
       /* Create a device and context */
       device = create_device();
       context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
       if(err < 0) {
          perror("Couldn't create a context");
          exit(1);   
       }
    
       /* Build the program */
       program = build_program(context, device, PROGRAM_FILE);
    
       /* Create kernels */
       kernel_init = clCreateKernel(program, BSORT_INIT, &err);
       if(err < 0) {
          perror("Couldn't create the initial kernel");
          exit(1);   
       };
       kernel_stage_0 = clCreateKernel(program, BSORT_STAGE_0, &err);
       if(err < 0) {
          perror("Couldn't create the stage_0 kernel");
          exit(1);   
       };
       kernel_stage_n = clCreateKernel(program, BSORT_STAGE_N, &err);
       if(err < 0) {
          perror("Couldn't create the stage_n kernel");
          exit(1);   
       };
       kernel_merge = clCreateKernel(program, BSORT_MERGE, &err);
       if(err < 0) {
          perror("Couldn't create the merge kernel");
          exit(1);   
       };
       kernel_merge_last = clCreateKernel(program, BSORT_MERGE_LAST, &err);
       if(err < 0) {
          perror("Couldn't create the merge_last kernel");
          exit(1);   
       };
    
       /* Determine maximum work-group size */
       // 获取工作组中工作项的 数量限制
       err = clGetKernelWorkGroupInfo(kernel_init, device, CL_KERNEL_WORK_GROUP_SIZE,
          sizeof(local_size), &local_size, NULL);
       if(err < 0) {
          perror("Couldn't find the maximum work-group size");
          exit(1);   
       };
       local_size = (int)pow(2, trunc(log2(local_size))); //函数 TRUNC 直接去除数字的小数部分
    
    
       /* Create buffer */
       data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
             CL_MEM_COPY_HOST_PTR, sizeof(data), data, &err);
       if(err < 0) {
          perror("Couldn't create a buffer");
          exit(1);   
       };
    
       /* Create kernel argument */
       err = clSetKernelArg(kernel_init, 0, sizeof(cl_mem), &data_buffer);
       err |= clSetKernelArg(kernel_stage_0, 0, sizeof(cl_mem), &data_buffer);
       err |= clSetKernelArg(kernel_stage_n, 0, sizeof(cl_mem), &data_buffer);
       err |= clSetKernelArg(kernel_merge, 0, sizeof(cl_mem), &data_buffer);
       err |= clSetKernelArg(kernel_merge_last, 0, sizeof(cl_mem), &data_buffer);
       if(err < 0) {
          printf("Couldn't set a kernel argument");
          exit(1);
       };
    
       /* Create kernel argument */
       err = clSetKernelArg(kernel_init, 1, 8*local_size*sizeof(float), NULL);
       err |= clSetKernelArg(kernel_stage_0, 1, 8*local_size*sizeof(float), NULL);
       err |= clSetKernelArg(kernel_stage_n, 1, 8*local_size*sizeof(float), NULL);
       err |= clSetKernelArg(kernel_merge, 1, 8*local_size*sizeof(float), NULL);
       err |= clSetKernelArg(kernel_merge_last, 1, 8*local_size*sizeof(float), NULL);
       if(err < 0) {
          printf("Couldn't set a kernel argument");
          exit(1);
       };
    
       /* Create a command queue */
       // 创建命令队列
       queue = clCreateCommandQueue(context, device, 0, &err);
       if(err < 0) {
          perror("Couldn't create a command queue");
          exit(1);   
       };
    
       /* Enqueue initial sorting kernel */
       global_size = NUM_FLOATS/8;
       if(global_size < local_size) {
          local_size = global_size;
       }
       err = clEnqueueNDRangeKernel(queue, kernel_init, 1, NULL, &global_size, 
             &local_size, 0, NULL, NULL); 
       if(err < 0) {
          perror("Couldn't enqueue the kernel");
          exit(1);   
       }
    
       /* Execute further stages */
       num_stages = global_size/local_size;
       for(high_stage = 2; high_stage < num_stages; high_stage <<= 1) {
    
          err = clSetKernelArg(kernel_stage_0, 2, sizeof(int), &high_stage);      
          err |= clSetKernelArg(kernel_stage_n, 3, sizeof(int), &high_stage);
          if(err < 0) {
             printf("Couldn't set a kernel argument");
             exit(1);
          };
    
          for(stage = high_stage; stage > 1; stage >>= 1) {
    
             err = clSetKernelArg(kernel_stage_n, 2, sizeof(int), &stage);
             if(err < 0) {
                printf("Couldn't set a kernel argument");
                exit(1);
             };
    
             err = clEnqueueNDRangeKernel(queue, kernel_stage_n, 1, NULL, 
                   &global_size, &local_size, 0, NULL, NULL); 
             if(err < 0) {
                perror("Couldn't enqueue the kernel");
                exit(1);   
             }
          }
    
          err = clEnqueueNDRangeKernel(queue, kernel_stage_0, 1, NULL, 
                &global_size, &local_size, 0, NULL, NULL); 
          if(err < 0) {
             perror("Couldn't enqueue the kernel");
             exit(1);   
          }
       }
    
       /* Set the sort direction */
       direction = DIRECTION;
       err = clSetKernelArg(kernel_merge, 3, sizeof(int), &direction);
       err |= clSetKernelArg(kernel_merge_last, 2, sizeof(int), &direction);
       if(err < 0) {
          printf("Couldn't set a kernel argument");
          exit(1);
       };
    
       /* Perform the bitonic merge */
       for(stage = num_stages; stage > 1; stage >>= 1) {
    
          err = clSetKernelArg(kernel_merge, 2, sizeof(int), &stage);
          if(err < 0) {
             printf("Couldn't set a kernel argument");
             exit(1);
          };
    
          err = clEnqueueNDRangeKernel(queue, kernel_merge, 1, NULL, 
                &global_size, &local_size, 0, NULL, NULL); 
          if(err < 0) {
             perror("Couldn't enqueue the kernel");
             exit(1);   
          }
       }
       err = clEnqueueNDRangeKernel(queue, kernel_merge_last, 1, NULL, 
             &global_size, &local_size, 0, NULL, NULL); 
       if(err < 0) {
          perror("Couldn't enqueue the kernel");
          exit(1);   
       }
    
       /* Read the result */
       err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, 
          sizeof(data), &data, 0, NULL, NULL);
       if(err < 0) {
          perror("Couldn't read the buffer");
          exit(1);   
       }
    
       check = 1;
    
       /* Check ascending sort */
       if(direction == 0) {
          for(i=1; i<NUM_FLOATS; i++) {
             if(data[i] < data[i-1]) {
                check = 0;
                break;
             }
          }
       }
       /* Check descending sort */
       if(direction == -1) {
          for(i=1; i<NUM_FLOATS; i++) {
             if(data[i] > data[i-1]) {
                check = 0;
                break;
             }
          }
       }
    
       /* Display check result */
       printf("Local size: %zu
    ", local_size);
       printf("Global size: %zu
    ", global_size);
       if(check)
          printf("Bitonic sort succeeded.
    ");
       else
          printf("Bitonic sort failed.
    ");
    
       /* Deallocate resources */
       clReleaseMemObject(data_buffer);
       clReleaseKernel(kernel_init);
       clReleaseKernel(kernel_stage_0);
       clReleaseKernel(kernel_stage_n);
       clReleaseKernel(kernel_merge);
       clReleaseKernel(kernel_merge_last);
       clReleaseCommandQueue(queue);
       clReleaseProgram(program);
       clReleaseContext(context);
       return 0;
    }

    核函数:

    /* Sort elements within a vector */
    #define VECTOR_SORT(input, dir)                                   
       comp = input < shuffle(input, mask2) ^ dir;                    
       input = shuffle(input, as_uint4(comp * 2 + add2));             
       comp = input < shuffle(input, mask1) ^ dir;                    
       input = shuffle(input, as_uint4(comp + add1));                 
    
    #define VECTOR_SWAP(input1, input2, dir)                          
       temp = input1;                                                 
       comp = (input1 < input2 ^ dir) * 4 + add3;                     
       input1 = shuffle2(input1, input2, as_uint4(comp));             
       input2 = shuffle2(input2, temp, as_uint4(comp));               
    
    /* Perform initial sort */
    __kernel void bsort_init(__global float4 *g_data, __local float4 *l_data) {
    
       int dir;
       uint id, global_start, size, stride;
       float4 input1, input2, temp;
       int4 comp;
    
       uint4 mask1 = (uint4)(1, 0, 3, 2);
       uint4 mask2 = (uint4)(2, 3, 0, 1);
       uint4 mask3 = (uint4)(3, 2, 1, 0);
    
       int4 add1 = (int4)(1, 1, 3, 3);
       int4 add2 = (int4)(2, 3, 2, 3);
       int4 add3 = (int4)(1, 2, 2, 3);
    
       id = get_local_id(0) * 2;
       global_start = get_group_id(0) * get_local_size(0) * 2 + id;
    
       input1 = g_data[global_start]; 
       input2 = g_data[global_start+1];
    
       /* Sort input 1 - ascending */
       comp = input1 < shuffle(input1, mask1);
       input1 = shuffle(input1, as_uint4(comp + add1));
       comp = input1 < shuffle(input1, mask2);
       input1 = shuffle(input1, as_uint4(comp * 2 + add2));
       comp = input1 < shuffle(input1, mask3);
       input1 = shuffle(input1, as_uint4(comp + add3));
    
       /* Sort input 2 - descending */
       comp = input2 > shuffle(input2, mask1);
       input2 = shuffle(input2, as_uint4(comp + add1));
       comp = input2 > shuffle(input2, mask2);
       input2 = shuffle(input2, as_uint4(comp * 2 + add2));
       comp = input2 > shuffle(input2, mask3);
       input2 = shuffle(input2, as_uint4(comp + add3));     
    
       /* Swap corresponding elements of input 1 and 2 */
       add3 = (int4)(4, 5, 6, 7);
       dir = get_local_id(0) % 2 * -1;
       temp = input1;
       comp = (input1 < input2 ^ dir) * 4 + add3;
       input1 = shuffle2(input1, input2, as_uint4(comp));
       input2 = shuffle2(input2, temp, as_uint4(comp));
    
       /* Sort data and store in local memory */
       VECTOR_SORT(input1, dir);
       VECTOR_SORT(input2, dir);
       l_data[id] = input1;
       l_data[id+1] = input2;
    
       /* Create bitonic set */
       for(size = 2; size < get_local_size(0); size <<= 1) {
          dir = (get_local_id(0)/size & 1) * -1;
    
          for(stride = size; stride > 1; stride >>= 1) {
             barrier(CLK_LOCAL_MEM_FENCE);
             id = get_local_id(0) + (get_local_id(0)/stride)*stride;
             VECTOR_SWAP(l_data[id], l_data[id + stride], dir)
          }
    
          barrier(CLK_LOCAL_MEM_FENCE);
          id = get_local_id(0) * 2;
          input1 = l_data[id]; input2 = l_data[id+1];
          temp = input1;
          comp = (input1 < input2 ^ dir) * 4 + add3;
          input1 = shuffle2(input1, input2, as_uint4(comp));
          input2 = shuffle2(input2, temp, as_uint4(comp));
          VECTOR_SORT(input1, dir);
          VECTOR_SORT(input2, dir);
          l_data[id] = input1;
          l_data[id+1] = input2;
       }
    
       /* Perform bitonic merge */
       dir = (get_group_id(0) % 2) * -1;
       for(stride = get_local_size(0); stride > 1; stride >>= 1) {
          barrier(CLK_LOCAL_MEM_FENCE);
          id = get_local_id(0) + (get_local_id(0)/stride)*stride;
          VECTOR_SWAP(l_data[id], l_data[id + stride], dir)
       }
       barrier(CLK_LOCAL_MEM_FENCE);
    
       /* Perform final sort */
       id = get_local_id(0) * 2;
       input1 = l_data[id]; input2 = l_data[id+1];
       temp = input1;
       comp = (input1 < input2 ^ dir) * 4 + add3;
       input1 = shuffle2(input1, input2, as_uint4(comp));
       input2 = shuffle2(input2, temp, as_uint4(comp));
       VECTOR_SORT(input1, dir);
       VECTOR_SORT(input2, dir);
       g_data[global_start] = input1;
       g_data[global_start+1] = input2;
    }
    
    /* Perform lowest stage of the bitonic sort */
    __kernel void bsort_stage_0(__global float4 *g_data, __local float4 *l_data, 
                                uint high_stage) {
    
       int dir;
       uint id, global_start, stride;
       float4 input1, input2, temp;
       int4 comp;
    
       uint4 mask1 = (uint4)(1, 0, 3, 2);
       uint4 mask2 = (uint4)(2, 3, 0, 1);
       uint4 mask3 = (uint4)(3, 2, 1, 0);
    
       int4 add1 = (int4)(1, 1, 3, 3);
       int4 add2 = (int4)(2, 3, 2, 3);
       int4 add3 = (int4)(4, 5, 6, 7);
    
       /* Determine data location in global memory */
       id = get_local_id(0);
       dir = (get_group_id(0)/high_stage & 1) * -1;
       global_start = get_group_id(0) * get_local_size(0) * 2 + id;
    
       /* Perform initial swap */
       input1 = g_data[global_start];
       input2 = g_data[global_start + get_local_size(0)];
       comp = (input1 < input2 ^ dir) * 4 + add3;
       l_data[id] = shuffle2(input1, input2, as_uint4(comp));
       l_data[id + get_local_size(0)] = shuffle2(input2, input1, as_uint4(comp));
    
       /* Perform bitonic merge */
       for(stride = get_local_size(0)/2; stride > 1; stride >>= 1) {
          barrier(CLK_LOCAL_MEM_FENCE);
          id = get_local_id(0) + (get_local_id(0)/stride)*stride;
          VECTOR_SWAP(l_data[id], l_data[id + stride], dir)
       }
       barrier(CLK_LOCAL_MEM_FENCE);
    
       /* Perform final sort */
       id = get_local_id(0) * 2;
       input1 = l_data[id]; input2 = l_data[id+1];
       temp = input1;
       comp = (input1 < input2 ^ dir) * 4 + add3;
       input1 = shuffle2(input1, input2, as_uint4(comp));
       input2 = shuffle2(input2, temp, as_uint4(comp));
       VECTOR_SORT(input1, dir);
       VECTOR_SORT(input2, dir);
    
       /* Store output in global memory */
       g_data[global_start + get_local_id(0)] = input1;
       g_data[global_start + get_local_id(0) + 1] = input2;
    }
    
    /* Perform successive stages of the bitonic sort */
    __kernel void bsort_stage_n(__global float4 *g_data, __local float4 *l_data, 
                                uint stage, uint high_stage) {
    
       int dir;
       float4 input1, input2;
       int4 comp, add;
       uint global_start, global_offset;
    
       add = (int4)(4, 5, 6, 7);
    
       /* Determine location of data in global memory */
       dir = (get_group_id(0)/high_stage & 1) * -1;
       global_start = (get_group_id(0) + (get_group_id(0)/stage)*stage) *
                       get_local_size(0) + get_local_id(0);
       global_offset = stage * get_local_size(0);
    
       /* Perform swap */
       input1 = g_data[global_start];
       input2 = g_data[global_start + global_offset];
       comp = (input1 < input2 ^ dir) * 4 + add;
       g_data[global_start] = shuffle2(input1, input2, as_uint4(comp));
       g_data[global_start + global_offset] = shuffle2(input2, input1, as_uint4(comp));
    }
    
    /* Sort the bitonic set */
    __kernel void bsort_merge(__global float4 *g_data, __local float4 *l_data, uint stage, int dir) {
    
       float4 input1, input2;
       int4 comp, add;
       uint global_start, global_offset;
    
       add = (int4)(4, 5, 6, 7);
    
       /* Determine location of data in global memory */
       global_start = (get_group_id(0) + (get_group_id(0)/stage)*stage) *
                       get_local_size(0) + get_local_id(0);
       global_offset = stage * get_local_size(0);
    
       /* Perform swap */
       input1 = g_data[global_start];
       input2 = g_data[global_start + global_offset];
       comp = (input1 < input2 ^ dir) * 4 + add;
       g_data[global_start] = shuffle2(input1, input2, as_uint4(comp));
       g_data[global_start + global_offset] = shuffle2(input2, input1, as_uint4(comp));
    }
    
    /* Perform final step of the bitonic merge */
    __kernel void bsort_merge_last(__global float4 *g_data, __local float4 *l_data, int dir) {
    
       uint id, global_start, stride;
       float4 input1, input2, temp;
       int4 comp;
    
       uint4 mask1 = (uint4)(1, 0, 3, 2);
       uint4 mask2 = (uint4)(2, 3, 0, 1);
       uint4 mask3 = (uint4)(3, 2, 1, 0);
    
       int4 add1 = (int4)(1, 1, 3, 3);
       int4 add2 = (int4)(2, 3, 2, 3);
       int4 add3 = (int4)(4, 5, 6, 7);
    
       /* Determine location of data in global memory */
       id = get_local_id(0);
       global_start = get_group_id(0) * get_local_size(0) * 2 + id;
    
       /* Perform initial swap */
       input1 = g_data[global_start];
       input2 = g_data[global_start + get_local_size(0)];
       comp = (input1 < input2 ^ dir) * 4 + add3;
       l_data[id] = shuffle2(input1, input2, as_uint4(comp));
       l_data[id + get_local_size(0)] = shuffle2(input2, input1, as_uint4(comp));
    
       /* Perform bitonic merge */
       for(stride = get_local_size(0)/2; stride > 1; stride >>= 1) {
          barrier(CLK_LOCAL_MEM_FENCE);
          id = get_local_id(0) + (get_local_id(0)/stride)*stride;
          VECTOR_SWAP(l_data[id], l_data[id + stride], dir)
       }
       barrier(CLK_LOCAL_MEM_FENCE);
    
       /* Perform final sort */
       id = get_local_id(0) * 2;
       input1 = l_data[id]; input2 = l_data[id+1];
       temp = input1;
       comp = (input1 < input2 ^ dir) * 4 + add3;
       input1 = shuffle2(input1, input2, as_uint4(comp));
       input2 = shuffle2(input2, temp, as_uint4(comp));
       VECTOR_SORT(input1, dir);
       VECTOR_SORT(input2, dir);
    
       /* Store the result to global memory */
       g_data[global_start + get_local_id(0)] = input1;
       g_data[global_start + get_local_id(0) + 1] = input2;
    }
    /* Sort elements within a vector */
    #define VECTOR_SORT(input, dir)                                   
       comp = input < shuffle(input, mask2) ^ dir;                    
       input = shuffle(input, as_uint4(comp * 2 + add2));             
       comp = input < shuffle(input, mask1) ^ dir;                    
       input = shuffle(input, as_uint4(comp + add1));                 
    
    #define VECTOR_SWAP(input1, input2, dir)                          
       temp = input1;                                                 
       comp = (input1 < input2 ^ dir) * 4 + add3;                     
       input1 = shuffle2(input1, input2, as_uint4(comp));             
       input2 = shuffle2(input2, temp, as_uint4(comp));               
  • 相关阅读:
    WebStrom
    设计模式之6大原则
    tortoiseSVN 合并代码方法
    SpannableString属性详解
    TortoiseSVN设置比较工具为BeyondCompare
    Android 扩大view点击范围
    activity 与 fragment生命周期
    记录一个 spring cloud 配置中心的坑,命令行端口参数无效,被覆盖,编码集问题无法读取文件等.
    spring boot admin + spring boot actuator + erueka 微服务监控
    spring boot actuator 简单使用
  • 原文地址:https://www.cnblogs.com/feihu-h/p/12107714.html
Copyright © 2020-2023  润新知