GPGPU OpenCL程式設計步驟與簡單例項
1.OpenCL概念
OpenCL是一個為異構平臺編寫程式的框架,此異構平臺可由CPU、GPU或其他型別的處理器組成。OpenCL由一門用於編寫kernels (在OpenCL裝置上執行的函式)的語言(基於C99)和一組用於定義並控制平臺的API組成。
OpenCL提供了兩種層面的並行機制:任務並行與資料並行。
2.OpenCL與CUDA的區別
不同點:OpenCL是通用的異構平臺程式語言,為了兼顧不同裝置,使用繁瑣。
CUDA是nvidia公司發明的專門在其GPGPU上的程式設計的框架,使用簡單,好入門。
相同點:都是基於任務並行與資料並行。
3.OpenCL的程式設計步驟
(1)Discover and initialize the platforms
呼叫兩次clGetPlatformIDs函式,第一次獲取可用的平臺數量,第二次獲取一個可用的平臺。
(2)Discover and initialize the devices
呼叫兩次clGetDeviceIDs函式,第一次獲取可用的裝置數量,第二次獲取一個可用的裝置。
(3)Create a context(呼叫clCreateContext函式)
上下文context可能會管理多個裝置device。
(4)Create a command queue(呼叫clCreateCommandQueue函式)
一個裝置device對應一個command queue。
上下文context將命令傳送到裝置對應的command queue,裝置就可以執行命令佇列裡的命令。
(5)Create device buffers(呼叫clCreateBuffer函式)
Buffer中儲存的是資料物件,就是裝置執行程式需要的資料儲存在其中。
Buffer由上下文context建立,這樣上下文管理的多個裝置就會共享Buffer中的資料。
(6)Write host data to device buffers(呼叫clEnqueueWriteBuffer函式)
(7)Create and compile the program
建立程式物件,程式物件就代表你的程式原始檔或者二進位制程式碼資料。
(8)Create the kernel(呼叫clCreateKernel函式)
根據你的程式物件,生成kernel物件,表示裝置程式的入口。
(9)Set the kernel arguments(呼叫clSetKernelArg函式)
(10)Configure the work-item structure(設定worksize)
配置work-item的組織形式(維數,group組成等)
(11)Enqueue the kernel for execution(呼叫clEnqueueNDRangeKernel函式)
將kernel物件,以及 work-item引數放入命令佇列中進行執行。
(12)Read the output buffer back to the host(呼叫clEnqueueReadBuffer函式)
(13)Release OpenCL resources(至此結束整個執行過程)
4.說明
OpenCL中的核函式通常單獨放在一個.cl檔案中(也可以把核函式原始碼以字串形式直接寫在主機程式裡)。
OpenCL的程式設計一般步驟就是上面的13步,太長了,以至於要想做個向量加法都是那麼困難。
不過上面的步驟前3步一般是固定的,可以單獨寫在一個.h/.cpp檔案中,其他的一般也不會有什麼大的變化。
5.程式例項,向量運算
5.1通用前3個步驟,生成一個檔案
tool.h
1 #ifndef TOOLH 2 #define TOOLH 3 4 #include <CL/cl.h> 5 #include <string.h> 6 #include <stdio.h> 7 #include <stdlib.h> 8 #include <iostream> 9 #include <string> 10 #include <fstream> 11 using namespace std; 12 13 /** convert the kernel file into a string */ 14 int convertToString(const char *filename, std::string& s); 15 16 /**Getting platforms and choose an available one.*/ 17 int getPlatform(cl_platform_id &platform); 18 19 /**Step 2:Query the platform and choose the first GPU device if has one.*/ 20 cl_device_id *getCl_device_id(cl_platform_id &platform); 21 22 #endif
tool.cpp
1 #include <CL/cl.h> 2 #include <string.h> 3 #include <stdio.h> 4 #include <stdlib.h> 5 #include <iostream> 6 #include <string> 7 #include <fstream> 8 #include "tool.h" 9 using namespace std; 10 11 /** convert the kernel file into a string */ 12 int convertToString(const char *filename, std::string& s) 13 { 14 size_t size; 15 char* str; 16 std::fstream f(filename, (std::fstream::in | std::fstream::binary)); 17 18 if(f.is_open()) 19 { 20 size_t fileSize; 21 f.seekg(0, std::fstream::end); 22 size = fileSize = (size_t)f.tellg(); 23 f.seekg(0, std::fstream::beg); 24 str = new char[size+1]; 25 if(!str) 26 { 27 f.close(); 28 return 0; 29 } 30 31 f.read(str, fileSize); 32 f.close(); 33 str[size] = '\0'; 34 s = str; 35 delete[] str; 36 return 0; 37 } 38 cout<<"Error: failed to open file\n:"<<filename<<endl; 39 return -1; 40 } 41 42 /**Getting platforms and choose an available one.*/ 43 int getPlatform(cl_platform_id &platform) 44 { 45 platform = NULL;//the chosen platform 46 47 cl_uint numPlatforms;//the NO. of platforms 48 cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); 49 if (status != CL_SUCCESS) 50 { 51 cout<<"Error: Getting platforms!"<<endl; 52 return -1; 53 } 54 55 /**For clarity, choose the first available platform. */ 56 if(numPlatforms > 0) 57 { 58 cl_platform_id* platforms = 59 (cl_platform_id* )malloc(numPlatforms* sizeof(cl_platform_id)); 60 status = clGetPlatformIDs(numPlatforms, platforms, NULL); 61 platform = platforms[0]; 62 free(platforms); 63 } 64 else 65 return -1; 66 } 67 68 /**Step 2:Query the platform and choose the first GPU device if has one.*/ 69 cl_device_id *getCl_device_id(cl_platform_id &platform) 70 { 71 cl_uint numDevices = 0; 72 cl_device_id *devices=NULL; 73 cl_int status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); 74 if (numDevices > 0) //GPU available. 
75 { 76 devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); 77 status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); 78 } 79 return devices; 80 }
5.2核函式檔案
HelloWorld_Kernel.cl
/* The kernel works on double. OpenCL 1.0/1.1 compilers only accept that type
 * once the double-precision extension has been enabled, so enable whichever
 * variant the device compiler advertises. */
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#endif

/**
 * Element-wise kernel: out[i] = in[i] / 2.4 * (in[i] / 6).
 *
 * Each work-item handles the element selected by its global id in
 * dimension 0. There is no bounds check, so the host must launch no more
 * work-items than the buffers have elements.
 *
 * @param in  Input values.
 * @param out Receives one result per work-item.
 */
__kernel void helloworld(__global double* in, __global double* out)
{
    const size_t num = get_global_id(0);
    const double value = in[num]; /* read the element once */
    out[num] = value / 2.4 * (value / 6);
}
5.3主函式檔案
HelloWorld.cpp
1 //For clarity,error checking has been omitted. 2 #include <CL/cl.h> 3 #include "tool.h" 4 #include <string.h> 5 #include <stdio.h> 6 #include <stdlib.h> 7 #include <iostream> 8 #include <string> 9 #include <fstream> 10 using namespace std; 11 12 int main(int argc, char* argv[]) 13 { 14 cl_int status; 15 /**Step 1: Getting platforms and choose an available one(first).*/ 16 cl_platform_id platform; 17 getPlatform(platform); 18 19 /**Step 2:Query the platform and choose the first GPU device if has one.*/ 20 cl_device_id *devices=getCl_device_id(platform); 21 22 /**Step 3: Create context.*/ 23 cl_context context = clCreateContext(NULL,1, devices,NULL,NULL,NULL); 24 25 /**Step 4: Creating command queue associate with the context.*/ 26 cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL); 27 28 /**Step 5: Create program object */ 29 const char *filename = "HelloWorld_Kernel.cl"; 30 string sourceStr; 31 status = convertToString(filename, sourceStr); 32 const char *source = sourceStr.c_str(); 33 size_t sourceSize[] = {strlen(source)}; 34 cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL); 35 36 /**Step 6: Build program. 
*/ 37 status=clBuildProgram(program, 1,devices,NULL,NULL,NULL); 38 39 /**Step 7: Initial input,output for the host and create memory objects for the kernel*/ 40 const int NUM=512000; 41 double* input = new double[NUM]; 42 for(int i=0;i<NUM;i++) 43 input[i]=i; 44 double* output = new double[NUM]; 45 46 cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, (NUM) * sizeof(double),(void *) input, NULL); 47 cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY , NUM * sizeof(double), NULL, NULL); 48 49 /**Step 8: Create kernel object */ 50 cl_kernel kernel = clCreateKernel(program,"helloworld", NULL); 51 52 /**Step 9: Sets Kernel arguments.*/ 53 status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer); 54 status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer); 55 56 /**Step 10: Running the kernel.*/ 57 size_t global_work_size[1] = {NUM}; 58 cl_event enentPoint; 59 status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &enentPoint); 60 clWaitForEvents(1,&enentPoint); ///wait 61 clReleaseEvent(enentPoint); 62 63 /**Step 11: Read the cout put back to host memory.*/ 64 status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, NUM * sizeof(double), output, 0, NULL, NULL); 65 cout<<output[NUM-1]<<endl; 66 67 /**Step 12: Clean the resources.*/ 68 status = clReleaseKernel(kernel);//*Release kernel. 69 status = clReleaseProgram(program); //Release the program object. 70 status = clReleaseMemObject(inputBuffer);//Release mem object. 71 status = clReleaseMemObject(outputBuffer); 72 status = clReleaseCommandQueue(commandQueue);//Release Command queue. 73 status = clReleaseContext(context);//Release context. 74 75 if (output != NULL) 76 { 77 free(output); 78 output = NULL; 79 } 80 81 if (devices != NULL) 82 { 83 free(devices); 84 devices = NULL; 85 } 86 return 0; 87 }
編譯、連結、執行:
$> g++ -I /opt/AMDAPP/include/ -o A *.cpp -lOpenCL
$> ./A