• opencl(十八)----矩阵转置、矩阵乘法


       矩阵转置

    //  kernel
    __kernel void transpose(__global float4 *g_mat, 
       __local float4 *l_mat, uint size) {
    
       __global float4 *src, *dst;
    
       /* Determine row and column location */
       int col = get_global_id(0);
       int row = 0;
       while(col >= size) {
          col -= size--;
          row++;
       }
       col += row;
       size += row;
    
       /* Read source block into local memory */
       src = g_mat + row * size * 4 + col;
       l_mat += get_local_id(0)*8;
       l_mat[0] = src[0];
       l_mat[1] = src[size];
       l_mat[2] = src[2*size];
       l_mat[3] = src[3*size];
    
       /* Process block on diagonal */
       if(row == col) {
          src[0] = 
             (float4)(l_mat[0].x, l_mat[1].x, l_mat[2].x, l_mat[3].x);
          src[size] = 
             (float4)(l_mat[0].y, l_mat[1].y, l_mat[2].y, l_mat[3].y);
          src[2*size] = 
             (float4)(l_mat[0].z, l_mat[1].z, l_mat[2].z, l_mat[3].z);
          src[3*size] = 
             (float4)(l_mat[0].w, l_mat[1].w, l_mat[2].w, l_mat[3].w);
       }
       /* Process block off diagonal */
       else {
          /* Read destination block into local memory */
          dst = g_mat + col * size * 4 + row;
          l_mat[4] = dst[0];
          l_mat[5] = dst[size];
          l_mat[6] = dst[2*size];
          l_mat[7] = dst[3*size];
    
          /* Set elements of destination block */
          dst[0] = 
             (float4)(l_mat[0].x, l_mat[1].x, l_mat[2].x, l_mat[3].x);
          dst[size] = 
             (float4)(l_mat[0].y, l_mat[1].y, l_mat[2].y, l_mat[3].y);
          dst[2*size] = 
             (float4)(l_mat[0].z, l_mat[1].z, l_mat[2].z, l_mat[3].z);
          dst[3*size] = 
             (float4)(l_mat[0].w, l_mat[1].w, l_mat[2].w, l_mat[3].w);
    
          /* Set elements of source block */
          src[0] = 
             (float4)(l_mat[4].x, l_mat[5].x, l_mat[6].x, l_mat[7].x);
          src[size] = 
             (float4)(l_mat[4].y, l_mat[5].y, l_mat[6].y, l_mat[7].y);
          src[2*size] = 
             (float4)(l_mat[4].z, l_mat[5].z, l_mat[6].z, l_mat[7].z);
          src[3*size] = 
             (float4)(l_mat[4].w, l_mat[5].w, l_mat[6].w, l_mat[7].w);
       }
    }

    // 主机程序

    #define _CRT_SECURE_NO_WARNINGS
    #define PROGRAM_FILE "transpose.cl"
    #define KERNEL_FUNC "transpose"
    
    #define MATRIX_DIM 64
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    #ifdef MAC
    #include <OpenCL/cl.h>
    #else
    #include <CL/cl.h>
    #endif
    
    /* Find a GPU or CPU associated with the first available platform */
    cl_device_id create_device() {
    
       cl_platform_id platform;
       cl_device_id dev;
       int err;
    
       /* Identify a platform */
       err = clGetPlatformIDs(1, &platform, NULL);
       if(err < 0) {
          perror("Couldn't identify a platform");
          exit(1);
       } 
    
       /* Access a device */
       err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
       if(err == CL_DEVICE_NOT_FOUND) {
          err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
       }
       if(err < 0) {
          perror("Couldn't access any devices");
          exit(1);   
       }
    
       return dev;
    }
    
    /* Create program from a file and compile it */
    cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
    
       cl_program program;
       FILE *program_handle;
       char *program_buffer, *program_log;
       size_t program_size, log_size;
       int err;
    
       /* Read program file and place content into buffer */
       program_handle = fopen(filename, "r");
       if(program_handle == NULL) {
          perror("Couldn't find the program file");
          exit(1);
       }
       fseek(program_handle, 0, SEEK_END);
       program_size = ftell(program_handle);
       rewind(program_handle);
       program_buffer = (char*)malloc(program_size + 1);
       program_buffer[program_size] = '';
       fread(program_buffer, sizeof(char), program_size, program_handle);
       fclose(program_handle);
    
       /* Create program from file */
       program = clCreateProgramWithSource(ctx, 1, 
          (const char**)&program_buffer, &program_size, &err);
       if(err < 0) {
          perror("Couldn't create the program");
          exit(1);
       }
       free(program_buffer);
    
       /* Build program */
       err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
       if(err < 0) {
    
          /* Find size of log and print to std output */
          clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
                0, NULL, &log_size);
          program_log = (char*) malloc(log_size + 1);
          program_log[log_size] = '';
          clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
                log_size + 1, program_log, NULL);
          printf("%s
    ", program_log);
          free(program_log);
          exit(1);
       }
       return program;
    }
    
    int main() {
    
       /* Host/device data structures */
       cl_device_id device;
       cl_context context;
       cl_command_queue queue;
       cl_program program;
       cl_kernel kernel;
       size_t global_size;
       cl_ulong mem_size;
       int i, j, err, check;
     
       /* Data and buffers */
       cl_uint matrix_dim;
       float data[MATRIX_DIM][MATRIX_DIM];
       cl_mem data_buffer;
    
       /* Initialize data */
       for(i=0; i<MATRIX_DIM; i++) {
          for(j=0; j<MATRIX_DIM; j++) {
             data[i][j] = 1.0f * i * MATRIX_DIM + j;
          }
       }
    
       /* Create a device and context */
       device = create_device();
       context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
       if(err < 0) {
          perror("Couldn't create a context");
          exit(1);   
       }
    
       /* Build the program */
       program = build_program(context, device, PROGRAM_FILE);
    
       /* Create a kernel */
       kernel = clCreateKernel(program, KERNEL_FUNC, &err);
       if(err < 0) {
          perror("Couldn't create a kernel");
          exit(1);
       };
    
       /* Create buffer to hold matrix */
       data_buffer = clCreateBuffer(context, 
             CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
             sizeof(data), data, &err);
       if(err < 0) {
          perror("Couldn't create a buffer");
          exit(1);   
       };
    
       /* Determine execution parameters */
       global_size = (MATRIX_DIM/4 * (MATRIX_DIM/4 + 1))/2;
       clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,     
             sizeof(mem_size), &mem_size, NULL);
    
       /* Create kernel arguments */
       matrix_dim = MATRIX_DIM/4;
       err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer);
       err |= clSetKernelArg(kernel, 1, (size_t)mem_size, NULL);
       err |= clSetKernelArg(kernel, 2, sizeof(matrix_dim), &matrix_dim);
       if(err < 0) {
          printf("Couldn't set a kernel argument");
          exit(1);   
       };
    
       /* Create a command queue */
       queue = clCreateCommandQueue(context, device, 0, &err);
       if(err < 0) {
          perror("Couldn't create a command queue");
          exit(1);   
       };
    
       /* Enqueue kernel */
       err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, 
             NULL, 0, NULL, NULL);
       if(err < 0) {
          perror("Couldn't enqueue the kernel");
          printf("Error: %d
    ", err);
          exit(1);   
       } 
    
       /* Read the result */
       err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, 
          sizeof(data), data, 0, NULL, NULL);
       if(err < 0) {
          perror("Couldn't read the buffer");
          exit(1);   
       } 
    
       /* Check data */
       check = 1;
       for(i=0; i<MATRIX_DIM; i++) {
          for(j=0; j<MATRIX_DIM; j++) {
             if(data[i][j] != 1.0*j*MATRIX_DIM+i) {
                check = 0;
                break;
             }
          }
       }
       if(check)
          printf("Transpose check succeeded.
    ");
       else
          printf("Transpose check failed.
    ");
    
       /* Deallocate resources */
       clReleaseMemObject(data_buffer);
       clReleaseKernel(kernel);
       clReleaseCommandQueue(queue);
       clReleaseProgram(program);
       clReleaseContext(context);
       return 0;
    }
  • 相关阅读:
    jquery获取网页在浏览器中可见宽度与高度
    mysql子查询
    ffmpeg 转码并截图的命令分析
    mysql find_in_set 与 in 的用法与区别,mysql范围搜索,mysql范围查询
    CSS一行显示,显示不下的用省略号显示
    js如何通过变量调用函数,函数名在变量里面
    jq自定义下拉菜单,在点击非当前下拉菜单区域时,关闭下拉菜单(点击事件的对象不是目标元素本身)
    解决JS浮点数(小数)计算加减乘除的BUG
    php裁剪图片
    百度蜘蛛ip段代表的不同含义
  • 原文地址:https://www.cnblogs.com/feihu-h/p/12107384.html
Copyright © 2020-2023  润新知