▶ 矩阵乘法:按照书里的内容做了几方面的优化,包括局部内存、矢量数据类型、寄存器、流水线等。
● 最直接的乘法。调用时在 main.c 中使用 size_t globalSize[2] = { rowA, colB }, localSize[2] = { 16, 16 };。rowA 已经隐含在 get_global_id(0) 的取值范围中,因此不再作为参数出现在核函数里,后面的几种方法也是如此。
1 // multiply.cl 2 __kernel void multiply01(__global float *inputA, __global float *inputB, __global float *outputC, int colA, int colB) 3 { 4 const int row = get_global_id(0), col = get_global_id(1); 5 int k; 6 float sum; 7 for (k = 0, sum = 0.0f; k < colA; k++) 8 sum += inputA[row * colA + k] * inputB[k * colB + col]; 9 outputC[row * colB + col] = sum; 10 return; 11 }
1 // main.c 2 #include <stdio.h> 3 #include <math.h> 4 #include <stdlib.h> 5 #include <time.h> 6 #include <cl.h> 7 8 const int rowA = 4096, colA = 1024, colB = 2048; 9 //const int rowA = 128, colA = 128, colB = 128; // 测试用,刚够 multiply05 的 1 组 10 const char *sourceText = "D:\Code\OpenCL\multiply.cl"; 11 12 bool floatEq(const float a, const float b)// 相等返回 1 13 { 14 if (b == 0) 15 return fabs(a) < 0.001; 16 return fabs(a / b - 1) < 0.001; 17 } 18 19 int readText(const char* kernelPath, char **pcode)// 读取文本文件放入 pcode,返回字符串长度 20 { 21 FILE *fp; 22 int size; 23 //printf("<readText> File: %s ", kernelPath); 24 fopen_s(&fp, kernelPath, "rb"); 25 if (!fp) 26 { 27 printf("Open kernel file failed "); 28 getchar(); 29 exit(-1); 30 } 31 if (fseek(fp, 0, SEEK_END) != 0) 32 { 33 printf("Seek end of file failed "); 34 getchar(); 35 exit(-1); 36 } 37 if ((size = ftell(fp)) < 0) 38 { 39 printf("Get file position failed "); 40 getchar(); 41 exit(-1); 42 } 43 rewind(fp); 44 if ((*pcode = (char *)malloc(size + 1)) == NULL) 45 { 46 printf("Allocate space failed "); 47 getchar(); 48 exit(-1); 49 } 50 fread(*pcode, 1, size, fp); 51 (*pcode)[size] = '