忙了好久,期中考试考完了,要开始研究 OpenCL 了,这是自己的第一个 hello world。就是数组的并行化(向量相加),当然这里的 kernel 函数没有单独写在文件里,主要是程序不大,姑且就这样吧,便于阅读。以下是源代码:
#include <iostream> #include <stdio.h> #include <stdlib.h> #include <CL/cl.h> const char* programSource= "__kernel \n" "void vecadd(__global const float* A, \n" "__global const float* B, \n" " __global float* C) \n" "{ \n" " int id = get_global_id(0); \n" " C[id] = A[id] + B[id]; \n" "} \n" ; int main() { int *A = NULL; // 输入数组 int *B = NULL; // 输入数组 int *C = NULL; // 输出数组 // 数组的大小 const int elements = 2048; // 计算内存大小 size_t datasize = sizeof(int)*elements; // 分配内存空间 A = (int*)malloc(datasize); B = (int*)malloc(datasize); C = (int*)malloc(datasize); // 初始化输入数组 for(int i = 0;i < elements;i++) { A[i] = std::rand(); B[i] = std::rand(); } // 获取并初始化平台 cl_int status; cl_uint numPlatforms = 0; cl_platform_id *platforms = NULL; status = clGetPlatformIDs(0,NULL,&numPlatforms); platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id)); // status = clGetPlatformIDs(numPlatforms,platforms,NULL); cl_uint numDevices = 0; cl_device_id *devices = NULL; status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL,0,NULL,&numDevices); // 分配内存空间 devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id)); status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL,numDevices,devices,NULL); cl_context context = NULL; //创建上下文,管理设备之间的资料 context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); cl_command_queue cmdQueue; //创建命令队列 cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status); //初始化数组内存 cl_mem bufferA; cl_mem bufferB; cl_mem bufferC; bufferA = clCreateBuffer( context, CL_MEM_READ_ONLY, datasize, NULL, &status); bufferB = clCreateBuffer( context, CL_MEM_READ_ONLY, datasize, NULL, &status); bufferC = clCreateBuffer( context, CL_MEM_WRITE_ONLY, datasize, NULL, &status); //将主机端的数据写入设备 status = clEnqueueWriteBuffer( cmdQueue, bufferA, CL_FALSE, 0, datasize, A, 0, NULL, NULL); status = clEnqueueWriteBuffer( cmdQueue, bufferB, CL_FALSE, 0, datasize, B, 0, NULL, NULL); //编译函数 cl_program program = clCreateProgramWithSource( context, 1, (const 
char**)&programSource, NULL, &status); status = clBuildProgram( program, numDevices, devices, NULL, NULL, NULL); //创建Kernel函数 cl_kernel kernel = NULL; kernel = clCreateKernel(program,"vecadd",&status); //设置参数 status = clSetKernelArg(kernel,0,sizeof(cl_mem),&bufferA); status = clSetKernelArg(kernel,1,sizeof(cl_mem),&bufferB); status = clSetKernelArg(kernel,2,sizeof(cl_mem),&bufferC); //初始化线程的映射 size_t globalWorkSize[1]; globalWorkSize[0] = elements; //运行kernel status = clEnqueueNDRangeKernel( cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); //从设备中读回数据结果 clEnqueueReadBuffer( cmdQueue, bufferC, CL_TRUE, 0, datasize, C, 0, NULL, NULL); bool result = true; for(int i = 0;i < elements;i++) { //std::cout<<C[i]<<std::endl; if(C[i]!=A[i]+B[i]) { result = false; //break; } } if(result) { printf("Output is correct\n"); } else { printf("Output is incorrect\n"); } //清理数据 clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(bufferA); clReleaseMemObject(bufferB); clReleaseMemObject(bufferC); clReleaseContext(context); free(A); free(B); free(C); free(platforms); free(devices); return 0; }