初入OpenCL,做个记录。
在Windows下开发OpenCL程序,必须先下载并安装OpenCL的SDK。现在AMD、NVIDIA、Intel均提供各自的OpenCL库,基本是大同小异。安装好SDK后新建Win32控制台项目,然后需要配置包含文件路径和库路径,具体见下图(我安装的是Intel的SDK)。
1.其中那个包含Intel的路径就是包含cl.h文件的目录。
2.如图中那个Intel的lib目录
3.添加需要链接的静态库OpenCL.lib
配置完成后就可以开始写代码调试了,OpenCL的初始化还是很复杂的,和CUDA几行代码搞定完全没可比性,刚开始可能对流程不太熟悉,慢慢熟悉就好,当然也可以自己写个框架来做这些复杂的初始化工作。OpenCL的内核代码是即时编译的,代码中我为了方便没有从cl文件中读入Kernel代码,直接以字符串的形式定义了。
1 #include "stdafx.h" 2 3 #include <iostream> 4 #include <fstream> 5 #include <string.h> 6 #include <vector> 7 using namespace std; 8 9 #if defined(__APPLE__) || defined(__MACOSX) 10 #include <OpenCL/cl.hpp> 11 #else 12 #include <CL/cl.h> 13 #endif 14 15 #define KERNEL(...) #__VA_ARGS__ 16 17 #define ARRAY_X_LEN 16 18 #define ARRAY_Y_LEN 16 19 20 const char *kernelSourceCode = KERNEL( 21 __kernel void VecAdd(__global int *buffer1, __global int *buffer2, __global int *buffer3) 22 { 23 size_t idx = get_global_id(0); 24 size_t idy = get_global_id(1); 25 int dimX = get_global_size(0); 26 int dimY = get_global_size(1); 27 int id = idx + idy*dimX; 28 buffer3[id] = buffer1[id] + buffer2[id]; 29 }); 30 31 int main() 32 { 33 cl_int status = 0; 34 size_t deviceListSize; 35 cl_uint numPlatforms; 36 cl_platform_id platform = NULL; 37 status = clGetPlatformIDs(0, NULL, &numPlatforms); 38 if (status != CL_SUCCESS) 39 { 40 printf("获取平台数目失败"); 41 return EXIT_FAILURE; 42 } 43 if (numPlatforms >0) 44 { 45 cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id)); 46 status = clGetPlatformIDs(numPlatforms, platforms, NULL); 47 if (status != CL_SUCCESS) 48 { 49 printf("初始化平台失败"); 50 return -1; 51 } 52 for (unsigned int i = 0; i<numPlatforms; ++i) 53 { 54 char *vendor = (char*)malloc(100); 55 status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL); 56 platform = platforms[i]; 57 if (!strcmp(vendor, "NVIDIA Corporation")) 58 { 59 break; 60 } 61 } 62 delete platforms; 63 } 64 cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; 65 cl_context_properties* cprops = (NULL == platform) ? 
NULL : cps; 66 cl_context context = clCreateContextFromType(cprops, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); 67 if (status != CL_SUCCESS) 68 { 69 printf("创建上下文失败"); 70 return EXIT_FAILURE; 71 } 72 status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); 73 if (status != CL_SUCCESS) 74 { 75 printf("获取设备数目失败"); 76 return EXIT_FAILURE; 77 } 78 cl_device_id *devices = (cl_device_id *)malloc(deviceListSize); 79 if (devices == 0) 80 { 81 printf("为设备分配空间失败"); 82 return EXIT_FAILURE; 83 } 84 status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); 85 if (status != CL_SUCCESS) 86 { 87 printf("初始化设备失败"); 88 return EXIT_FAILURE; 89 } 90 91 size_t sourceSize[] = { strlen(kernelSourceCode) }; 92 cl_program program = clCreateProgramWithSource(context, 1, &kernelSourceCode, sourceSize, &status); 93 if (status != CL_SUCCESS) 94 { 95 printf("创建程序失败"); 96 return EXIT_FAILURE; 97 } 98 status = clBuildProgram(program, 1, devices, NULL, NULL, NULL); 99 if (status != CL_SUCCESS) 100 { 101 printf("编译程序失败"); 102 return EXIT_FAILURE; 103 } 104 cl_kernel kernel = clCreateKernel(program, "VecAdd", &status); 105 if (status != CL_SUCCESS) 106 { 107 printf("创建内核失败"); 108 return EXIT_FAILURE; 109 } 110 cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, &status); 111 if (status != CL_SUCCESS) 112 { 113 printf("创建命令队列失败"); 114 return EXIT_FAILURE; 115 } 116 int arrayLenght = ARRAY_X_LEN*ARRAY_Y_LEN; 117 int arraySize = arrayLenght*sizeof(int); 118 119 int *hA = new int[arrayLenght]; 120 int *hB = new int[arrayLenght]; 121 int *hC = new int[arrayLenght]; 122 123 memset(hA, 0, arraySize); 124 memset(hB, 0, arraySize); 125 memset(hC, 0, arraySize); 126 127 for (int i = 0; i<arrayLenght; i++) 128 { 129 hA[i] = i; 130 hB[i] = i; 131 } 132 133 cl_mem dA = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, arraySize, NULL, &status); 134 if (status != CL_SUCCESS) 135 { 136 printf("创建内存对象失败"); 137 return EXIT_FAILURE; 138 } 139 
cl_mem dB = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, arraySize, NULL, &status); 140 if (status != CL_SUCCESS) 141 { 142 printf("创建内存对象失败"); 143 return EXIT_FAILURE; 144 } 145 cl_mem dC = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, arraySize, NULL, &status); 146 if (status != CL_SUCCESS) 147 { 148 printf("创建内存对象失败"); 149 return EXIT_FAILURE; 150 } 151 status = clEnqueueWriteBuffer(commandQueue, dA, CL_TRUE, 0, arraySize, hA, 0, NULL, NULL); 152 if (status != CL_SUCCESS) 153 { 154 printf("输入值写入内存对象失败"); 155 return EXIT_FAILURE; 156 } 157 status = clEnqueueWriteBuffer(commandQueue, dB, CL_TRUE, 0, arraySize, hB, 0, NULL, NULL); 158 if (status != CL_SUCCESS) 159 { 160 printf("输入值写入内存对象失败"); 161 return EXIT_FAILURE; 162 } 163 status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&dA); 164 if (status != CL_SUCCESS) 165 { 166 printf("设置内核参数失败"); 167 return EXIT_FAILURE; 168 } 169 status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&dB); 170 if (status != CL_SUCCESS) 171 { 172 printf("设置内核参数失败"); 173 return EXIT_FAILURE; 174 } 175 status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&dC); 176 if (status != CL_SUCCESS) 177 { 178 printf("设置内核参数失败"); 179 return EXIT_FAILURE; 180 } 181 size_t globalThreads[] = { ARRAY_X_LEN, ARRAY_Y_LEN }; 182 size_t localThreads[] = { 4, 4 }; 183 status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); 184 if (status != CL_SUCCESS) 185 { 186 printf("将内核放入命令队列失败"); 187 return EXIT_FAILURE; 188 } 189 status = clFinish(commandQueue); 190 if (status != CL_SUCCESS) 191 { 192 printf("队列还没有完成"); 193 return EXIT_FAILURE; 194 } 195 status = clEnqueueReadBuffer(commandQueue, dC, CL_TRUE, 0, arraySize, hC, 0, NULL, NULL); 196 if (status != CL_SUCCESS) 197 { 198 printf("读内存对象失败"); 199 return EXIT_FAILURE; 200 } 201 printf("结果:\n"); 202 for (int i = 0; i<arrayLenght; i++) 203 { 204 printf("%d ", hC[i]); 205 if ((i + 1) % ARRAY_X_LEN == 0) 206 printf("\n"); 207 } 208 status = 
clReleaseKernel(kernel); 209 status = clReleaseProgram(program); 210 status = clReleaseMemObject(dA); 211 status = clReleaseMemObject(dB); 212 status = clReleaseMemObject(dC); 213 status = clReleaseCommandQueue(commandQueue); 214 status = clReleaseContext(context); 215 free(devices); 216 delete [] hA; 217 delete [] hB; 218 delete [] hC; 219 return 0; 220 }
运行结果: