摘自《OpenCL 编程指南》Page 29
HelloWorld.cpp 中的main()函数会实现或调用一组函数,完成以下操作:
(1) 在第一个可用平台上创建OpenCL上下文
(2)在第一个可用设备上创建命令队列
(3)加载一个内核文件(HelloWorld.cl)并将它构建到程序对象中
(4)为HelloWorld.cl 中的内核函数hello_kernel()创建一个内核对象
(5)为内核函数的参数(a,b, result)创建内存对象
(6)将待执行的内核排队
(7)将内核结果读回结果缓冲区
(1)、(2) 对应:initContext(),
clGetPlatformIDs()、clGetContextInfo()、clCreateContextFromType()、clCreateCommandQueue()
(3)对应: initPrograms()
clCreateProgramWithSource()、clBuildProgram()
(4)对应:clCreateKernel()(-> kernels.push_back(QCLKernel()))
(5)在程序中创建参数对象,调用 clCreateBuffer等创建内存对象,供内核执行
(6)clSetKernelArg()、clEnqueueNDRangeKernel()(在数据集上分布内核)
(7)clEnqueueReadBuffer() 从内核中读回结果
// HelloWorld OpenCL host program.
//
// Creates a context and command queue on the first available platform/device,
// builds the kernel source file HelloWorld.cl, runs the kernel "hello_kernel"
// over two input arrays of ARRAY_SIZE floats and prints the result array.

#include <CL/cl.h>

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

using namespace std;

// Number of float elements in each of the a, b and result arrays.
const int ARRAY_SIZE = 10;

// Creates an OpenCL context on the first available platform.
// A GPU context is tried first; if that fails, a CPU context is tried.
// Returns NULL (after printing a message) when no context can be created.
cl_context createContext()
{
    cl_int errNum;
    cl_uint numPlatforms = 0;
    cl_platform_id firstPlatformId;
    cl_context context = NULL;

    // 1. Select an OpenCL platform to run on (only the first one is requested).
    errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
    if (errNum != CL_SUCCESS || numPlatforms == 0)
    {
        cerr << "Failed to find any OpenCL platforms." << endl;
        return NULL;
    }

    // 2. Create an OpenCL context on the platform.
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties) firstPlatformId,
        0
    };
    context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
                                      NULL, NULL, &errNum);
    if (errNum != CL_SUCCESS)
    {
        cout << "Could not create GPU context, trying CPU..." << endl;
        context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,
                                          NULL, NULL, &errNum);
        if (errNum != CL_SUCCESS)
        {
            cerr << "Failed to create an OpenCL GPU or CPU context. ";
            return NULL;
        }
    }

    return context;
}

// Creates a command queue on the first device of `context`.
// On success the chosen device is written to `*device` and the queue is
// returned; on failure NULL is returned and `*device` is left untouched.
cl_command_queue createCommandQueue(cl_context context, cl_device_id *device)
{
    cl_int errNum;
    cl_command_queue commandQueue = NULL;
    size_t deviceBufferSize = 0;

    // 1. Get the size (in bytes) of the devices buffer.
    errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
    if (errNum != CL_SUCCESS)
    {
        cerr << "Failed call to clGetContextInfo()";
        return NULL;
    }

    // Anything smaller than one device id means there is no device to index.
    if (deviceBufferSize < sizeof(cl_device_id))
    {
        cerr << "No devices available.";
        return NULL;
    }

    // 2. Allocate memory for the device buffer. A vector is used so that the
    //    storage is released on every return path, including the error ones.
    vector<cl_device_id> devices(deviceBufferSize / sizeof(cl_device_id));
    errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize,
                              &devices[0], NULL);
    if (errNum != CL_SUCCESS)
    {
        cerr << "Failed to get device IDs";
        return NULL;
    }

    // 3. In this example, we just choose the first available device. In a real
    //    program, you would likely use all available devices or choose the
    //    highest performance device.
    commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
    if (commandQueue == NULL)
    {
        cerr << "Failed to create commandQueue for device 0";
        return NULL;
    }

    *device = devices[0];
    return commandQueue;
}

// Loads the kernel source in `fileName` and builds it into a program object
// for `context`. `device` is only used to fetch the build log on failure.
// Returns NULL (after printing a message / the build log) on any failure.
cl_program createProgram(cl_context context, cl_device_id device, const char* fileName)
{
    cl_int errNum;
    cl_program program;

    ifstream kernelFile(fileName, ios::in);
    if (!kernelFile.is_open())
    {
        cerr << "Failed to open file for reading: " << fileName << endl;
        return NULL;
    }

    // Read the whole file into one string; srcStr points into srcStdStr,
    // which stays alive until the end of this function.
    ostringstream oss;
    oss << kernelFile.rdbuf();
    string srcStdStr = oss.str();
    const char *srcStr = srcStdStr.c_str();

    program = clCreateProgramWithSource(context, 1, &srcStr, NULL, NULL);
    if (program == NULL)
    {
        cerr << "Failed to create CL program for from source.";
        return NULL;
    }

    errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        // Determine the reason for the error. The buffer starts out as an
        // empty string so that it is safe to print even if the query fails.
        char buildLog[16384];
        buildLog[0] = '\0';
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              sizeof(buildLog), buildLog, NULL);

        cerr << "Error in kernel: " << endl;
        cerr << buildLog;
        clReleaseProgram(program);
        return NULL;
    }

    return program;
}

// Creates the three buffers used as kernel arguments:
//   memObjects[0] - read-only copy of `a`
//   memObjects[1] - read-only copy of `b`
//   memObjects[2] - read/write result buffer
// Returns false if any buffer could not be created. Buffers that were created
// are left in `memObjects` (failed ones are NULL) for the caller to release.
bool createMemObject(cl_context context, cl_mem memObjects[3], float *a, float *b)
{
    const size_t bufferBytes = sizeof(float) * ARRAY_SIZE;

    memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   bufferBytes, a, NULL);
    memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   bufferBytes, b, NULL);
    memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                   bufferBytes, NULL, NULL);

    if (memObjects[0] == NULL || memObjects[1] == NULL || memObjects[2] == NULL)
    {
        cerr << "Error creating memory objects." << endl;
        return false;
    }

    return true;
}

// Releases every OpenCL object that has been created so far.
// Handles that are still 0 (never created) are skipped.
static void Cleanup(cl_context context, cl_command_queue commandQueue,
                    cl_program program, cl_kernel kernel, cl_mem memObjects[3])
{
    for (int i = 0; i < 3; i++)
    {
        if (memObjects[i] != 0)
            clReleaseMemObject(memObjects[i]);
    }
    if (commandQueue != 0)
        clReleaseCommandQueue(commandQueue);
    if (kernel != 0)
        clReleaseKernel(kernel);
    if (program != 0)
        clReleaseProgram(program);
    if (context != 0)
        clReleaseContext(context);
}

// Common exit path: releases all OpenCL objects, waits for a key press
// (via the shell's "pause" command) and hands back `exitCode`.
static int finishRun(int exitCode, cl_context context, cl_command_queue commandQueue,
                     cl_program program, cl_kernel kernel, cl_mem memObjects[3])
{
    Cleanup(context, commandQueue, program, kernel, memObjects);
    system("pause");
    return exitCode;
}

// Runs the HelloWorld sample end to end. Returns 0 on success, 1 on failure.
int main(int argc, char** argv)
{
    (void) argc;
    (void) argv;

    cl_context context = 0;
    cl_command_queue commandQueue = 0;
    cl_program program = 0;
    cl_device_id device = 0;
    cl_kernel kernel = 0;
    cl_mem memObjects[3] = {0, 0, 0};
    cl_int errNum;

    // 1. Create the context.
    context = createContext();
    if (context == NULL)
    {
        cerr << "Failed to create OpenCL context." << endl;
        return finishRun(1, context, commandQueue, program, kernel, memObjects);
    }

    // 2. Create a command-queue on the first device available on the created context.
    commandQueue = createCommandQueue(context, &device);
    if (commandQueue == NULL)
    {
        return finishRun(1, context, commandQueue, program, kernel, memObjects);
    }

    // 3. Create the OpenCL program from the HelloWorld.cl kernel source.
    program = createProgram(context, device, "HelloWorld.cl");
    if (program == NULL)
    {
        return finishRun(1, context, commandQueue, program, kernel, memObjects);
    }

    // 4. Create the OpenCL kernel.
    kernel = clCreateKernel(program, "hello_kernel", NULL);
    if (kernel == NULL)
    {
        cerr << "Failed to create kernel " << endl;
        return finishRun(1, context, commandQueue, program, kernel, memObjects);
    }

    // 5. Create memory objects that will be used as arguments to the kernel.
    float result[ARRAY_SIZE];
    float a[ARRAY_SIZE];
    float b[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        a[i] = static_cast<float>(i);
        b[i] = static_cast<float>(i * 2);
    }

    if (!createMemObject(context, memObjects, a, b))
    {
        return finishRun(1, context, commandQueue, program, kernel, memObjects);
    }

    // 6. Set the kernel arguments in the order (a, b, result).
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
    if (errNum != CL_SUCCESS)
    {
        cerr << "Error setting kernel arguments." << endl;
        return finishRun(1, context, commandQueue, program, kernel, memObjects);
    }

    size_t globalWorkSize[1] = {ARRAY_SIZE};
    size_t localWorkSize[1] = {1};

    // 7. Queue the kernel up for execution across the array.
    errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
                                    globalWorkSize, localWorkSize, 0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        cerr << "Erro queuing kernel for execution." << endl;
        return finishRun(1, context, commandQueue, program, kernel, memObjects);
    }

    // 8. Read the output buffer back to the host (CL_TRUE requests a blocking read).
    errNum = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE, 0,
                                 ARRAY_SIZE * sizeof(float), result, 0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        cerr << "Error reading result buffer." << endl;
        return finishRun(1, context, commandQueue, program, kernel, memObjects);
    }

    // 9. Output the result buffer.
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        cout << result[i] << " ";
    }
    cout << endl;
    cout << "Exectued successfully. " << endl;

    return finishRun(0, context, commandQueue, program, kernel, memObjects);
}
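注意:按作者的观察,clCreateBuffer() 执行时只创建了内存对象(initial()),需要在同一个函数作用域内紧接着执行 clEnqueueNDRangeKernel(),才能完成主机内存中的数据 a、b、result 到显存的拷贝;否则,需在 clCreateBuffer() 之后立即调用 clEnqueueWriteBuffer()(即需要 read() 一下),手动将内容写入显存,这样就无需立即调用 clEnqueueNDRangeKernel()。(补充:按 OpenCL 规范,使用 CL_MEM_COPY_HOST_PTR 时,主机数据在 clCreateBuffer() 调用期间即被拷贝;上述现象更可能出现在使用 CL_MEM_USE_HOST_PTR、而主机数组在内核执行前已离开作用域的情况,请结合实际使用的标志确认。)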