Any hope of this running better?

I have an OpenCL kernel which reads from an OpenGL texture (the texture is a render target attached to a framebuffer object). Once the texture has been rendered to, I need to run my kernel to calculate another texture for the next render step, where both will be used together. Everything works correctly and the results I read back look correct, but it kills my performance. I was hoping for performance that would rival that of GLSL shaders, so I'm wondering if I'm doing something wrong.

Kernel

    /*
     * OpenCL C source for the "Lum" kernel.
     *
     * Each work-item reads one texel of readImage2D at its 2D global id,
     * converts it to a luminance value scaled to 0..255 (weights in kToLum),
     * and stores it as one byte in lumHist. Rows are written vertically
     * flipped relative to the image coordinates that were read.
     *
     * The float-to-uchar conversion is saturating: without _sat, a luminance
     * above 255 (any colour component > 1.0) has an implementation-defined
     * result. Work-items outside the image return early so that a global size
     * rounded up to a work-group multiple cannot write past the end of lumHist.
     */
    const char* source =
    "#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n"
    "\n"
    "__constant float4 kToLum = (float4)(0.299f, 0.587f, 0.114f, 0.0f); \n"
    "\n"
    "__kernel void Lum(__read_only image2d_t readImage2D, __global uchar* lumHist)\n"
    "{\n"
    "   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; \n"
    "   int ix_s = get_global_id(0); \n"
    "   int iy_t = get_global_id(1); \n"
    "\n"
    "   int imgWidth = get_image_width(readImage2D); \n"
    "   int imgHeight = get_image_height(readImage2D); \n"
    "   if (ix_s >= imgWidth || iy_t >= imgHeight) return; \n"
    "\n"
    "   int2 ixy_st = (int2)(ix_s, iy_t); \n"
    "   float4 color = read_imagef(readImage2D, sampler, ixy_st); \n"
    "   float lum = dot(color, kToLum) * 255.0f; \n"
    "   uchar index = convert_uchar_sat_rte(lum); \n"
    "\n"
    "   iy_t = (imgHeight - 1) - iy_t; \n"
    "   lumHist[ix_s + (iy_t * imgWidth)] = index; \n"
    "}\n";

Setup

// Creates the two CL memory objects held by oclb:
//  - readBufferIn_0:   a read-only CL image wrapping mip level 0 of the GL
//                      texture oclb.tex2DID;
//  - writeBufferOut_0: a write-only CL buffer backed by a newly allocated,
//                      cleared host array (oclb.indexBuffer) with one element
//                      per texel.
// The CL error codes are captured locally but not reported to the caller.
void _CreateBuffer(oclLumBuffer_t& oclb, const oclrt_t& oclrt)
{
    typedef oclLumBuffer_t::bufferType_t elem_t;

    cl_int status = 0;

    // Input: share the GL texture with CL.
    oclb.readBufferIn_0 = clCreateFromGLTexture2D(oclrt.context, CL_MEM_READ_ONLY,
                                                  GL_TEXTURE_2D, 0, oclb.tex2DID, &status);

    // Output: host-side storage, one element per texel, zeroed before use.
    int elemCount = oclb.texWidth * oclb.texHeight;
    oclb.bufferSize = elemCount * sizeof(elem_t);
    oclb.indexBuffer = new elem_t[elemCount];
    MemClr(&oclb.indexBuffer[0], oclb.bufferSize);

    // CL_MEM_USE_HOST_PTR makes the CL buffer use the application's array.
    oclb.writeBufferOut_0 = clCreateBuffer(oclrt.context,
                                           CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                                           oclb.bufferSize, &oclb.indexBuffer[0], &status);
}

// Binds the kernel arguments: argument 0 is the input image
// (oclb.readBufferIn_0), argument 1 is the output buffer
// (oclb.writeBufferOut_0). Return codes of clSetKernelArg are not inspected.
void _SetBuffer(oclLumBuffer_t& oclb, const oclProgram_t& oclp)
{
    clSetKernelArg(oclp.kernel, 0, sizeof(oclb.readBufferIn_0), &oclb.readBufferIn_0);
    clSetKernelArg(oclp.kernel, 1, sizeof(cl_mem), (void*)&oclb.writeBufferOut_0);
}

Run kernel

// Enqueues acquisition of the shared GL texture (oclb.readBufferIn_0) on the
// device command queue, with no wait list and no completion event.
// The returned status is captured but not reported.
void _AquireGL(oclLumBuffer_t& oclb, const oclrt_t& oclrt)
{
    cl_int status = clEnqueueAcquireGLObjects(oclrt.deviceCmdQueue,
                                              1, &oclb.readBufferIn_0,
                                              0, NULL, NULL);
    (void)status;
}

// Enqueues release of the shared GL texture (oclb.readBufferIn_0) on the
// device command queue, with no wait list and no completion event.
// The returned status is captured but not reported.
void _ReleaseGL(oclLumBuffer_t& oclb, const oclrt_t& oclrt)
{
    cl_int status = clEnqueueReleaseGLObjects(oclrt.deviceCmdQueue,
                                              1, &oclb.readBufferIn_0,
                                              0, NULL, NULL);
    (void)status;
}

// Runs the kernel once over the whole texture and reads the result back into
// oclb.indexBuffer.
//
// Sequence: finish pending GL work, acquire the shared texture, launch a 2D
// NDRange of texWidth x texHeight work-items (work-group size chosen by the
// implementation), wait for the kernel, release the texture, then do a
// blocking read of oclb.writeBufferOut_0 into the host array.
//
// If the kernel cannot be enqueued or does not complete successfully, the
// texture is still released and the read-back is skipped; oclb.indexBuffer is
// then left with whatever it held before the call.
void _RunCompute(oclProgram_t& oclp, oclLumBuffer_t& oclb, const oclrt_t& oclrt)
{
    // CL/GL sharing requires that all pending GL commands touching the texture
    // have completed before it is acquired; glFinish is the portable way to
    // guarantee that. (With cl_khr_gl_event this could be replaced by
    // event-based synchronisation.)
    glFinish();

    _AquireGL(oclb, oclrt);

    const cl_uint workDim = 2;
    const size_t globalWorkSize[2] = {
        static_cast<size_t>(oclb.texWidth),
        static_cast<size_t>(oclb.texHeight)
    };

    cl_int error = clEnqueueNDRangeKernel(oclrt.deviceCmdQueue, oclp.kernel, workDim,
                                          NULL, globalWorkSize, NULL,
                                          0, NULL, &oclp.kernelEvent);

    if (error == CL_SUCCESS) {
        clFlush(oclrt.deviceCmdQueue);

        // Block until the kernel finishes instead of polling the event status
        // in a loop: polling burns a CPU core and never terminates when the
        // event ends with an error (negative) status.
        error = clWaitForEvents(1, &oclp.kernelEvent);
        clReleaseEvent(oclp.kernelEvent);
    }

    // Always hand the texture back to GL, even when the kernel failed.
    _ReleaseGL(oclb, oclrt);
    clFlush(oclrt.deviceCmdQueue);

    if (error != CL_SUCCESS) {
        return; // nothing valid to read back
    }

    // Blocking read: returns once the data is in oclb.indexBuffer.
    clEnqueueReadBuffer(oclrt.deviceCmdQueue, oclb.writeBufferOut_0, CL_TRUE,
                        0, oclb.bufferSize, &oclb.indexBuffer[0],
                        0, NULL, NULL);
}

Sorry for not using the code formatting; it lags and doesn't work very well.

Latest Images

Trending Articles

Latest Images