I have an OpenCL kernel that reads from an OpenGL texture (the texture is a render target attached to a framebuffer object). Once the texture has been rendered to, I need to run my kernel to calculate another texture for the next render step, where both will be used together. Everything works correctly and the results I read back look correct, but it kills my performance. I was hoping for performance that would rival that of GLSL shaders; I'm wondering if I'm doing something wrong.
Kernel
/*
 * OpenCL C source for the "Lum" kernel.
 *
 * One work-item per texel: reads the texel at (global id 0, global id 1) from
 * readImage2D, converts it to an 8-bit luminance value and stores it in
 * lumHist, one byte per texel, with the rows flipped vertically (image row 0
 * is written to the last row of lumHist).
 *
 * lumHist must hold at least image-width * image-height bytes.
 */
const char* source =
/* Byte-granular stores to __global memory are an extension on OpenCL 1.0. */
"#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n"
"\n"
/* RGB -> luma weights (Rec. 601); the alpha channel is ignored. */
"__constant float4 kToLum = (float4)(0.299f, 0.587f, 0.114f, 0.0f); \n"
"\n"
"__kernel void Lum(__read_only image2d_t readImage2D, __global uchar* lumHist)\n"
"{\n"
"    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; \n"
"    const int ix_s = get_global_id(0); \n"
"    const int iy_t = get_global_id(1); \n"
"    const int imgWidth = get_image_width(readImage2D); \n"
"    const int imgHeight = get_image_height(readImage2D); \n"
"\n"
/* Guard: lets the host round the global size up to a multiple of the
 * work-group size without the surplus work-items writing out of bounds. */
"    if (ix_s >= imgWidth || iy_t >= imgHeight) { \n"
"        return; \n"
"    } \n"
"\n"
"    const float4 color = read_imagef(readImage2D, sampler, (int2)(ix_s, iy_t)); \n"
"    const float lum = dot(color, kToLum) * 255.0f; \n"
/* Saturating conversion: without _sat, a luminance outside 0..255 (possible
 * with float render targets) yields an implementation-defined value. */
"    const uchar index = convert_uchar_sat_rte(lum); \n"
"\n"
/* Flip vertically when storing. */
"    const int flippedRow = (imgHeight - 1) - iy_t; \n"
"    lumHist[ix_s + (flippedRow * imgWidth)] = index; \n"
"}\n";
Setup
// Creates the two OpenCL memory objects stored in `oclb`:
//   - readBufferIn_0:   a read-only CL image wrapping mip level 0 of the GL
//                       2D texture oclb.tex2DID;
//   - writeBufferOut_0: a write-only CL buffer that uses the freshly
//                       allocated, zero-cleared host array oclb.indexBuffer
//                       as its storage (one element per texel).
// Also fills in oclb.bufferSize (size of that array in bytes).
// The error codes returned by the CL calls are not inspected.
void _CreateBuffer(oclLumBuffer_t& oclb, const oclrt_t& oclrt)
{
    cl_int status = 0;

    // Input: CL view of the GL texture.
    GLint level = 0;
    oclb.readBufferIn_0 = clCreateFromGLTexture2D(oclrt.context, CL_MEM_READ_ONLY, GL_TEXTURE_2D,
                                                  level, oclb.tex2DID, &status);

    // Output: host-side array, one entry per texel, cleared before use.
    int elementCount = oclb.texWidth * oclb.texHeight;
    oclb.bufferSize = elementCount * sizeof(oclLumBuffer_t::bufferType_t);
    oclb.indexBuffer = new oclLumBuffer_t::bufferType_t[elementCount];
    MemClr(&oclb.indexBuffer[0], oclb.bufferSize);

    // The CL buffer writes to app memory (CL_MEM_USE_HOST_PTR).
    cl_mem_flags outFlags = CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR;
    oclb.writeBufferOut_0 = clCreateBuffer(oclrt.context, outFlags, oclb.bufferSize,
                                           &oclb.indexBuffer[0], &status);
}
// Binds the memory objects held in `oclb` to the kernel in `oclp`:
// readBufferIn_0 becomes kernel argument 0 and writeBufferOut_0 becomes
// kernel argument 1. The return codes of clSetKernelArg are not inspected.
void _SetBuffer(oclLumBuffer_t& oclb, const oclProgram_t& oclp)
{
    const cl_uint inputArg = 0;
    const cl_uint outputArg = 1;

    clSetKernelArg(oclp.kernel, inputArg, sizeof(oclb.readBufferIn_0), &oclb.readBufferIn_0);
    clSetKernelArg(oclp.kernel, outputArg, sizeof(cl_mem), (void*)&oclb.writeBufferOut_0);
}
Run kernel
// Enqueues an acquire of the GL-shared object oclb.readBufferIn_0 on the
// device command queue so that subsequent OpenCL commands may use it.
// No wait list is supplied and no completion event is requested; the
// returned error code is not inspected.
void _AquireGL(oclLumBuffer_t& oclb, const oclrt_t& oclrt)
{
    const cl_int error = clEnqueueAcquireGLObjects(oclrt.deviceCmdQueue,
                                                   1, &oclb.readBufferIn_0,  // one shared object
                                                   0, NULL,                  // empty wait list
                                                   NULL);                    // no event returned
    (void)error;
}
// Enqueues a release of the GL-shared object oclb.readBufferIn_0 on the
// device command queue, handing it back to OpenGL.
// No wait list is supplied and no completion event is requested; the
// returned error code is not inspected.
void _ReleaseGL(oclLumBuffer_t& oclb, const oclrt_t& oclrt)
{
    const cl_int error = clEnqueueReleaseGLObjects(oclrt.deviceCmdQueue,
                                                   1, &oclb.readBufferIn_0,  // one shared object
                                                   0, NULL,                  // empty wait list
                                                   NULL);                    // no event returned
    (void)error;
}
// Runs the kernel in `oclp` once over every texel of the shared texture and
// reads the result back into oclb.indexBuffer.
//
// Sequence (all on oclrt.deviceCmdQueue):
//   1. acquire the GL texture for OpenCL,
//   2. enqueue the 2D kernel (texWidth x texHeight work-items, work-group
//      size chosen by the implementation),
//   3. release the GL texture back to OpenGL,
//   4. blocking read of writeBufferOut_0 into oclb.indexBuffer.
//
// The only host-side stall is the blocking read in step 4, which explicitly
// waits on the kernel's event. The earlier version busy-polled
// clGetEventInfo between steps 2 and 3; that burned a CPU core, stalled the
// host before the GL object was released, and never terminated if the kernel
// failed to enqueue or finished with a (negative) error status.
//
// NOTE(review): this relies on the queue executing commands in order (the
// acquire -> kernel -> release ordering is not expressed through events) --
// confirm the queue is not created with out-of-order execution enabled.
// NOTE(review): OpenGL rendering into the texture must have completed before
// the acquire (e.g. glFinish or an equivalent sync by the caller); this
// function does not perform that synchronisation itself.
//
// On return oclp.kernelEvent has been released and must not be reused.
// The function has no channel to report failures; if the kernel cannot be
// enqueued the read-back is skipped and oclb.indexBuffer is left untouched.
void _RunCompute(oclProgram_t& oclp, oclLumBuffer_t& oclb, const oclrt_t& oclrt)
{
    _AquireGL(oclb, oclrt);

    const cl_uint workDim = 2;
    // Explicit casts: brace-initialising size_t from a narrower/signed type
    // is a narrowing conversion.
    const size_t globalWorkSize[2] = {
        static_cast<size_t>(oclb.texWidth),
        static_cast<size_t>(oclb.texHeight)
    };

    cl_int error = clEnqueueNDRangeKernel(oclrt.deviceCmdQueue, oclp.kernel, workDim,
                                          NULL,            // no global offset
                                          globalWorkSize,
                                          NULL,            // let the implementation pick the local size
                                          0, NULL, &oclp.kernelEvent);
    const bool kernelQueued = (error == CL_SUCCESS);

    // The texture was acquired above, so it is always released, even when the
    // kernel could not be enqueued.
    _ReleaseGL(oclb, oclrt);

    if (kernelQueued) {
        // Blocking read: submits the queued commands and returns once the
        // kernel (via the wait list) and the copy have completed.
        error = clEnqueueReadBuffer(oclrt.deviceCmdQueue, oclb.writeBufferOut_0, CL_TRUE,
                                    0, static_cast<size_t>(oclb.bufferSize), &oclb.indexBuffer[0],
                                    1, &oclp.kernelEvent, NULL);
        clReleaseEvent(oclp.kernelEvent);
    }
    (void)error;
}
Sorry for not using the code formatting; it lags and doesn't work very well.