I am trying to increase throughput using an AMD GPGPU with OpenCL, but I am getting less performance than a simple CPU loop. Also take note: this is my first OpenCL kernel, so I understand if it is rough.
Essentially I receive large batches of data objects; for each object I need to run a conversion, compare the result against a 512*7 look-up table, and compute a sum of squares over those 7 values (illustrated below). For now I want to measure the gain on a less powerful GPGPU (a Radeon 6570), on the theory that any speedup I get here will only improve as the GPGPU I select gets better.
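To illustrate the shape of the work, here is roughly what one object's cost search looks like as a plain C loop (the names and the conversion step are simplified placeholders, not my real code):

#include <float.h>

#define LUT_ROWS 512
#define LUT_COLS 7

/* For one converted object, find the row of the 512x7 look-up table
 * that minimizes the sum of squared differences against the object's
 * 7 values, and return that row's index. */
unsigned int findBestRow(const float obj[LUT_COLS],
                         const float lut[LUT_ROWS][LUT_COLS])
{
    unsigned int bestRow = 0;
    float minSum = FLT_MAX;

    for (unsigned int row = 0; row < LUT_ROWS; ++row)
    {
        float sum = 0.0f;
        for (unsigned int col = 0; col < LUT_COLS; ++col)
        {
            float diff = obj[col] - lut[row][col];
            sum += diff * diff;              /* sum of squares */
        }
        if (sum < minSum)                    /* track the minimum */
        {
            minSum = sum;
            bestRow = row;
        }
    }
    return bestRow;
}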
Here is my kernel code:
#define LUT_0_1_TABLE_ROW_SIZE 512
#define LUT_2_TABLE_COL_SIZE 12
#define LUT_2_TABLE_ROW_SIZE 207
#define LUT_2_TABLE_SIZE (LUT_2_TABLE_COL_SIZE*LUT_2_TABLE_ROW_SIZE)

struct object
{
    float m_fval0;
    float m_fval1;
    float m_fval2;
    float m_fval3;
    float m_fval4;
    float m_fval5;
    float m_fval6;
    float m_fval7;
    unsigned int m_uval0;
    unsigned int m_uval1;
    unsigned int m_uval2;
    unsigned int m_uval3;
    unsigned int m_uval4;
};

ushort getLut2TableIndex(float fin0, float fin1)
{
    float fMin;
    ushort tableIndex = (ushort)(fin1/7);    // use tableIndex as a temp var.....Hacks!!
    ushort minIdx = min(max(tableIndex, (ushort)0), (ushort)7);

    tableIndex = (minIdx == 1 ? 2 : minIdx); // compensate for the fact that the "1" position doesn't exist
    tableIndex *= 26;
    tableIndex -= (minIdx < 2 ? 1 : 0);      // compensation

    fMin = (fin0 - minIdx);
    tableIndex += (ushort)fMin;

    // Make sure the idx is in a valid range - otherwise we can crash!
    tableIndex = min(max(tableIndex, (ushort)0), (ushort)(LUT_2_TABLE_ROW_SIZE - 1));
    // clamp returns the value inside the min and max
    // not using clamp because of using OpenCL 1.0 -> final release will use OpenCL 1.1....
    return tableIndex;
}

__kernel void convertPdwsAndPerformAoa(global uint8 * input0,
                                       global uint8 * input1,
                                       global uint8 * input2,
                                       global uint8 * input3,
                                       global struct object * output,
                                       constant float * lut0,
                                       constant float4 * lut1,
                                       constant float * lut2,
                                       float fin0,
                                       local float4 * lut1Local)
{
    float ftemp0 = -9.3f;
    float ftemp1 = 12.0f;
    int itemp0 = 0;
    float tempLog = 0.0f;
    float4 f4temp0 = { 0.0f, 0.0f, 0.0f, 0.0f };
    float4 f4temp1 = { 0.0f, 0.0f, 0.0f, 0.0f };
    float8 f8temp0 = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
    float8 f8temp1 = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
    struct object tempOut;
    ushort lut2ColGroupIndex = 0;
    ushort lut2RowIdx;
    ushort lut0RowIdx = 0;
    float sumOfSquares;
    float minSum = MAXFLOAT;

    uint gid = get_global_id(0);
    uint lid = get_local_id(0);
    uint nloc = get_local_size(0);
    uint8 input = input3[gid];   // read this work-item's input

    // Copy the lut1 global vector into a local vector
    for (unsigned int i = lid; i < LUT_0_1_TABLE_ROW_SIZE; i += nloc)
    {
        lut1Local[i] = lut1[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Conversion code -> this is fast, no need to look at!
    // .................
    // **************************

    lut2RowIdx = getLut2TableIndex(tempOut.m_fval0, fin0);

    f4temp0.s0 = f4temp0.s0/f4temp1.s0;
    f4temp0.s1 = f4temp0.s1/f4temp1.s1 + lut2[lut2RowIdx + 3*lut2ColGroupIndex];
    f4temp0.s2 = f4temp0.s2/f4temp1.s2 + lut2[lut2RowIdx + 3*lut2ColGroupIndex + 1];
    f4temp0.s3 = f4temp0.s3/f4temp1.s3 + lut2[lut2RowIdx + 3*lut2ColGroupIndex + 2];

    f8temp0.s0 = f4temp0.s0 - f4temp0.s1;
    f8temp0.s1 = f4temp0.s0 - f4temp0.s2;
    f8temp0.s2 = f4temp0.s0 - f4temp0.s3;
    f8temp0.s3 = f4temp0.s1 - f4temp0.s2;
    f8temp0.s4 = f4temp0.s2 - f4temp0.s3;
    f8temp0.s5 = f4temp0.s1 - f4temp0.s3;
    f8temp0.s6 = f8temp0.s3 - f8temp0.s4;

    // Calculate the cost function
    for (unsigned short idx = 0; idx < LUT_0_1_TABLE_ROW_SIZE; ++idx)
    {
        // Calculate the lut1 components
        f8temp1.s0 = lut1Local[idx].s0*tempOut.m_fval0;
        f8temp1.s1 = lut1Local[idx].s1*tempOut.m_fval0;
        f8temp1.s2 = lut1Local[idx].s2*tempOut.m_fval0;
        f8temp1.s3 = f8temp1.s0 - f8temp1.s1;
        f8temp1.s4 = f8temp1.s0 - f8temp1.s2;
        f8temp1.s5 = f8temp1.s0 - f8temp1.s3;
        f8temp1.s6 = f8temp1.s3 - f8temp1.s4;

        f8temp0 -= f8temp1;
        f8temp1 += (f8temp1 > (float8)100.0f ? (float8)-5.0f : (float8)0.0f);

        // Square all elements of the vector
        f8temp1 *= f8temp1;

        // Sum all elements of the vector together
        sumOfSquares  = f8temp1.s0;
        sumOfSquares += f8temp1.s1;
        sumOfSquares += f8temp1.s2;
        sumOfSquares += f8temp1.s3;
        sumOfSquares += f8temp1.s4;
        sumOfSquares += f8temp1.s5;
        sumOfSquares += f8temp1.s6;

        // If the new sum is < the old min, then this is the new min! Update the idx and the minSum.
        lut0RowIdx = (minSum < sumOfSquares ? lut0RowIdx : idx);
        minSum = fmin(minSum, sumOfSquares);
    }

    lut0RowIdx = min(max(lut0RowIdx, (ushort)0), (ushort)(LUT_0_1_TABLE_ROW_SIZE - 1)); // clamp
    tempOut.m_fval4 = lut0[lut0RowIdx];
    output[gid] = tempOut;
}
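For context, the host side passes the local lut1 buffer and the launch sizes roughly like this (the argument index follows the kernel signature above, but the variable names and sizes here are illustrative, not my exact code):

#include <CL/cl.h>

/* ... context, queue, program, kernel, and buffers created earlier ... */

/* lut1Local is the 10th kernel argument (index 9); for a local buffer
 * we pass NULL as the value and only supply the size in bytes. */
size_t lut1LocalBytes = 512 * sizeof(cl_float4);  /* LUT_0_1_TABLE_ROW_SIZE entries */
clSetKernelArg(kernel, 9, lut1LocalBytes, NULL);

/* One work-item per data object. In OpenCL 1.0/1.1 the global size
 * must be a multiple of the local size, so numObjects is assumed to
 * be padded up to a multiple of 64 here. */
size_t localSize  = 64;           /* one AMD wavefront */
size_t globalSize = numObjects;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                       0, NULL, NULL);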
Anyway, I obfuscated my code a bit, so some things may not make much sense, but I do need all of those operations. One option I was looking at is to have one kernel do the conversion and a second kernel do the cost function, though the CPU would then have to loop through and run the cost-function kernel for each of the intended objects. A rough sketch of what I mean is below.
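To make that option concrete, the split would look something like this (the kernel names and exact signatures are invented for illustration; the bodies would be lifted straight from the combined kernel above):

// Kernel 1: conversion only; writes converted objects back to global memory
__kernel void convertObjects(global uint8 * input,
                             global struct object * converted)
{
    uint gid = get_global_id(0);
    // ... the conversion code from above, writing converted[gid] ...
}

// Kernel 2: cost function only; reads the converted objects
__kernel void runCostFunction(global struct object * converted,
                              global struct object * output,
                              constant float * lut0,
                              constant float4 * lut1,
                              local float4 * lut1Local)
{
    uint gid = get_global_id(0);
    // ... copy lut1 into lut1Local, run the 512-iteration cost loop,
    //     and write output[gid] exactly as in the combined kernel ...
}

The intermediate converted buffer would stay on the device between the two launches, so the split at least avoids any extra host transfers.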
Thanks for taking a look!