diff options
Diffstat (limited to 'test_conformance/basic/test_async_copy3D.cpp')
-rw-r--r-- | test_conformance/basic/test_async_copy3D.cpp | 331 |
1 files changed, 162 insertions, 169 deletions
diff --git a/test_conformance/basic/test_async_copy3D.cpp b/test_conformance/basic/test_async_copy3D.cpp index 252159bc..5eb41ebc 100644 --- a/test_conformance/basic/test_async_copy3D.cpp +++ b/test_conformance/basic/test_async_copy3D.cpp @@ -25,96 +25,95 @@ #include "../../test_common/harness/conversions.h" #include "procs.h" -static const char *async_global_to_local_kernel3D = - "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n" - "%s\n" // optional pragma string - "__kernel void test_fn( const __global %s *src, __global %s *dst, __local " - "%s *localBuffer, int numElementsPerLine, int numLines, int " - "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, " - "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n" - "{\n" - " int i, j, k;\n" - // Zero the local storage first - " for(i=0; i<planesCopiesPerWorkItem; i++)\n" - " for(j=0; j<numLines; j++)\n" - " for(k=0; k<numElementsPerLine; k++)\n" - " localBuffer[ (get_local_id( 0 " - ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + " - "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + " - "dstLineStride) + k ] = (%s)(%s)0;\n" - // Do this to verify all kernels are done zeroing the local buffer before we - // try the copy - " barrier( CLK_LOCAL_MEM_FENCE );\n" - " event_t event;\n" - " event = async_work_group_copy_3D3D( (__local %s*)localBuffer, " - "(__global const " - "%s*)(src+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*" - "numElementsPerLine + numLines*srcLineStride + srcPlaneStride)), " - "(size_t)numElementsPerLine, (size_t)numLines, srcLineStride, " - "dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, dstPlaneStride, " - "0 );\n" - // Wait for the copy to complete, then verify by manually copying to the - // dest - " wait_group_events( 1, &event );\n" - " for(i=0; i<planesCopiesPerWorkItem; i++)\n" - " for(j=0; j<numLines; j++)\n" - " for(k=0; k<numElementsPerLine; k++)\n" - " dst[ (get_global_id( 0 " - ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + " - "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + " - "dstLineStride) + k ] = localBuffer[ (get_local_id( 0 " - ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + " - "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + " - "dstLineStride) + k ];\n" - "}\n"; - -static const char *async_local_to_global_kernel3D = - "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n" - "%s\n" // optional pragma string - "__kernel void test_fn( const __global %s *src, __global %s *dst, __local " - "%s *localBuffer, int numElementsPerLine, int numLines, int " - "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, " - "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n" - "{\n" - " int i, j, k;\n" - // Zero the local storage first - " for(i=0; i<planesCopiesPerWorkItem; i++)\n" - " for(j=0; j<numLines; j++)\n" - " for(k=0; k<numElementsPerLine; k++)\n" - " localBuffer[ (get_local_id( 0 " - ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + " - "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + " - "srcLineStride) + k ] = (%s)(%s)0;\n" - // Do this to verify all kernels are done zeroing the local buffer before we - // try the copy - " barrier( CLK_LOCAL_MEM_FENCE );\n" - " for(i=0; i<planesCopiesPerWorkItem; i++)\n" - " for(j=0; j<numLines; j++)\n" - " for(k=0; k<numElementsPerLine; k++)\n" - " localBuffer[ (get_local_id( 0 " - ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + " - "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + " - "srcLineStride) + k ] = src[ (get_global_id( 0 " - ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + " - "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + " - "srcLineStride) + k ];\n" - // Do this to verify all kernels are done copying to the local buffer before - // we try the copy - " barrier( CLK_LOCAL_MEM_FENCE );\n" - " event_t event;\n" - " event = async_work_group_copy_3D3D((__global " - "%s*)(dst+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*" - "numElementsPerLine + numLines*dstLineStride + dstPlaneStride)), (__local " - "const %s*)localBuffer, (size_t)numElementsPerLine, (size_t)numLines, " - "srcLineStride, dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, " - "dstPlaneStride, 0 );\n" - " wait_group_events( 1, &event );\n" - "}\n"; +static const char *async_global_to_local_kernel3D = R"OpenCLC( +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable +%s // optional pragma string + +__kernel void test_fn(const __global %s *src, __global %s *dst, __local %s *localBuffer, + int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup, + int planesCopiesPerWorkItem, int srcLineStride, + int dstLineStride, int srcPlaneStride, int dstPlaneStride ) { + // Zero the local storage first + for (int i = 0; i < planesCopiesPerWorkItem; i++) { + for (int j = 0; j < numLines; j++) { + for (int k = 0; k < numElementsPerLine; k++) { + const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k; + localBuffer[index] = (%s)(%s)0; + } + } + } + + // Do this to verify all kernels are done zeroing the local buffer before we try the copy + barrier(CLK_LOCAL_MEM_FENCE); + + event_t event = async_work_group_copy_3D3D(localBuffer, 0, src, + planesCopiesPerWorkgroup * get_group_id(0) * srcPlaneStride, + sizeof(%s), (size_t)numElementsPerLine, (size_t)numLines, + planesCopiesPerWorkgroup, srcLineStride, srcPlaneStride, dstLineStride, + dstPlaneStride, 0); + + // Wait for the copy to complete, then verify by manually copying to the dest + wait_group_events(1, &event); + + for (int i = 0; i < planesCopiesPerWorkItem; i++) { + for (int j = 0; j < numLines; j++) { + for(int k = 0; k < numElementsPerLine; k++) { + const int local_index = (get_local_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k; + const int global_index = (get_global_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k; + dst[global_index] = localBuffer[local_index]; + } + } + } +} +)OpenCLC"; + +static const char *async_local_to_global_kernel3D = R"OpenCLC( +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable +%s // optional pragma string + +__kernel void test_fn(const __global %s *src, __global %s *dst, __local %s *localBuffer, + int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup, + int planesCopiesPerWorkItem, int srcLineStride, + int dstLineStride, int srcPlaneStride, int dstPlaneStride) { + // Zero the local storage first + for (int i = 0; i < planesCopiesPerWorkItem; i++) { + for (int j = 0; j < numLines; j++) { + for (int k = 0; k < numElementsPerLine; k++) { + const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j * srcLineStride + k; + localBuffer[index] = (%s)(%s)0; + } + } + } + + // Do this to verify all kernels are done zeroing the local buffer before we try the copy + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i=0; i < planesCopiesPerWorkItem; i++) { + for (int j=0; j < numLines; j++) { + for (int k=0; k < numElementsPerLine; k++) { + const int local_index = (get_local_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j * srcLineStride + k; + const int global_index = (get_global_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j*srcLineStride + k; + localBuffer[local_index] = src[global_index]; + } + } + } + + // Do this to verify all kernels are done copying to the local buffer before we try the copy + barrier(CLK_LOCAL_MEM_FENCE); + + event_t event = async_work_group_copy_3D3D(dst, + planesCopiesPerWorkgroup * get_group_id(0) * dstPlaneStride, localBuffer, 0, + sizeof(%s), (size_t)numElementsPerLine, (size_t)numLines, planesCopiesPerWorkgroup, + srcLineStride, srcPlaneStride, dstLineStride, dstPlaneStride, 0); + + wait_group_events(1, &event); +} +)OpenCLC"; int test_copy3D(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *kernelCode, - ExplicitType vecType, int vecSize, int srcLineStride, - int dstLineStride, int srcPlaneStride, int dstPlaneStride, + ExplicitType vecType, int vecSize, int srcLineMargin, + int dstLineMargin, int srcPlaneMargin, int dstPlaneMargin, bool localIsDst) { int error; @@ -133,10 +132,10 @@ int test_copy3D(cl_device_id deviceID, cl_context context, vecSize); size_t elementSize = get_explicit_type_size(vecType) * vecSize; - log_info("Testing %s with srcLineStride = %d, dstLineStride = %d, " - "srcPlaneStride = %d, dstPlaneStride = %d\n", - vecNameString, srcLineStride, dstLineStride, srcPlaneStride, - dstPlaneStride); + log_info("Testing %s with srcLineMargin = %d, dstLineMargin = %d, " + "srcPlaneMargin = %d, dstPlaneMargin = %d\n", + vecNameString, srcLineMargin, dstLineMargin, srcPlaneMargin, + dstPlaneMargin); cl_long max_local_mem_size; error = @@ -201,16 +200,20 @@ int test_copy3D(cl_device_id deviceID, cl_context context, if (max_workgroup_size > max_local_workgroup_size[0]) max_workgroup_size = max_local_workgroup_size[0]; - size_t numElementsPerLine = 10; - size_t numLines = 13; - size_t planesCopiesPerWorkItem = 2; + const size_t numElementsPerLine = 10; + const cl_int dstLineStride = numElementsPerLine + dstLineMargin; + const cl_int srcLineStride = numElementsPerLine + srcLineMargin; + + const size_t numLines = 13; + const cl_int dstPlaneStride = (numLines * dstLineStride) + dstPlaneMargin; + const cl_int srcPlaneStride = (numLines * srcLineStride) + srcPlaneMargin; + elementSize = get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize); - size_t localStorageSpacePerWorkitem = elementSize - * (planesCopiesPerWorkItem - * (numLines * numElementsPerLine - + numLines * (localIsDst ? dstLineStride : srcLineStride) - + (localIsDst ? dstPlaneStride : srcPlaneStride))); + const size_t planesCopiesPerWorkItem = 2; + const size_t localStorageSpacePerWorkitem = elementSize + * planesCopiesPerWorkItem + * (localIsDst ? dstPlaneStride : srcPlaneStride); size_t maxLocalWorkgroupSize = (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem); @@ -224,42 +227,41 @@ int test_copy3D(cl_device_id deviceID, cl_context context, if (maxLocalWorkgroupSize > max_workgroup_size) localWorkgroupSize = max_workgroup_size; - size_t maxTotalPlanesIn = ((max_alloc_size / elementSize) + srcPlaneStride) - / ((numLines * numElementsPerLine + numLines * srcLineStride) - + srcPlaneStride); - size_t maxTotalPlanesOut = ((max_alloc_size / elementSize) + dstPlaneStride) - / ((numLines * numElementsPerLine + numLines * dstLineStride) - + dstPlaneStride); - size_t maxTotalPlanes = (std::min)(maxTotalPlanesIn, maxTotalPlanesOut); - size_t maxLocalWorkgroups = + const size_t maxTotalPlanesIn = + ((max_alloc_size / elementSize) + srcPlaneMargin) / srcPlaneStride; + const size_t maxTotalPlanesOut = + ((max_alloc_size / elementSize) + dstPlaneMargin) / dstPlaneStride; + const size_t maxTotalPlanes = std::min(maxTotalPlanesIn, maxTotalPlanesOut); + const size_t maxLocalWorkgroups = maxTotalPlanes / (localWorkgroupSize * planesCopiesPerWorkItem); - size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem - - (localIsDst ? dstPlaneStride : srcPlaneStride); - size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups); - size_t totalPlanes = + const size_t localBufferSize = + localWorkgroupSize * localStorageSpacePerWorkitem + - (localIsDst ? dstPlaneMargin : srcPlaneMargin); + const size_t numberOfLocalWorkgroups = + std::min(1111, (int)maxLocalWorkgroups); + const size_t totalPlanes = numberOfLocalWorkgroups * localWorkgroupSize * planesCopiesPerWorkItem; - size_t inBufferSize = elementSize - * (totalPlanes - * (numLines * numElementsPerLine + numLines * srcLineStride) - + (totalPlanes - 1) * srcPlaneStride); - size_t outBufferSize = elementSize - * (totalPlanes - * (numLines * numElementsPerLine + numLines * dstLineStride) - + (totalPlanes - 1) * dstPlaneStride); - size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize; + const size_t inBufferSize = elementSize + * (totalPlanes * numLines * srcLineStride + + (totalPlanes - 1) * srcPlaneMargin); + const size_t outBufferSize = elementSize + * (totalPlanes * numLines * dstLineStride + + (totalPlanes - 1) * dstPlaneMargin); + const size_t globalWorkgroupSize = + numberOfLocalWorkgroups * localWorkgroupSize; inBuffer = (void *)malloc(inBufferSize); outBuffer = (void *)malloc(outBufferSize); outBufferCopy = (void *)malloc(outBufferSize); - cl_int planesCopiesPerWorkItemInt, numElementsPerLineInt, numLinesInt, - planesCopiesPerWorkgroup; - planesCopiesPerWorkItemInt = (int)planesCopiesPerWorkItem; - numElementsPerLineInt = (int)numElementsPerLine; - numLinesInt = (int)numLines; - planesCopiesPerWorkgroup = - (int)(planesCopiesPerWorkItem * localWorkgroupSize); + const cl_int planesCopiesPerWorkItemInt = + static_cast<cl_int>(planesCopiesPerWorkItem); + const cl_int numElementsPerLineInt = + static_cast<cl_int>(numElementsPerLine); + const cl_int numLinesInt = static_cast<cl_int>(numLines); + const cl_int planesCopiesPerWorkgroup = + static_cast<cl_int>(planesCopiesPerWorkItem * localWorkgroupSize); log_info("Global: %d, local %d, local buffer %db, global in buffer %db, " "global out buffer %db, each work group will copy %d planes and " @@ -336,14 +338,8 @@ int test_copy3D(cl_device_id deviceID, cl_context context, for (int k = 0; k < (int)numElementsPerLine * elementSize; k += elementSize) { - int inIdx = i - * (numLines * numElementsPerLine - + numLines * srcLineStride + srcPlaneStride) - + j * (numElementsPerLine + srcLineStride) + k; - int outIdx = i - * (numLines * numElementsPerLine - + numLines * dstLineStride + dstPlaneStride) - + j * (numElementsPerLine + dstLineStride) + k; + int inIdx = i * srcPlaneStride + j * srcLineStride + k; + int outIdx = i * dstPlaneStride + j * dstLineStride + k; if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx, typeSize) != 0) @@ -378,14 +374,11 @@ int test_copy3D(cl_device_id deviceID, cl_context context, } if (j < (int)numLines * elementSize) { - int outIdx = i - * (numLines * numElementsPerLine - + numLines * dstLineStride + dstPlaneStride) - + j * (numElementsPerLine + dstLineStride) + int outIdx = i * dstPlaneStride + j * dstLineStride + numElementsPerLine * elementSize; if (memcmp(((char *)outBuffer) + outIdx, ((char *)outBufferCopy) + outIdx, - dstLineStride * elementSize) + dstLineMargin * elementSize) != 0) { if (failuresPrinted == 0) @@ -409,14 +402,11 @@ int test_copy3D(cl_device_id deviceID, cl_context context, if (i < (int)(globalWorkgroupSize * planesCopiesPerWorkItem - 1) * elementSize) { - int outIdx = i - * (numLines * numElementsPerLine + numLines * dstLineStride - + dstPlaneStride) - + (numLines * elementSize) * (numElementsPerLine) - + (numLines * elementSize) * (dstLineStride); + int outIdx = + i * dstPlaneStride + numLines * dstLineStride * elementSize; if (memcmp(((char *)outBuffer) + outIdx, ((char *)outBufferCopy) + outIdx, - dstPlaneStride * elementSize) + dstPlaneMargin * elementSize) != 0) { if (failuresPrinted == 0) @@ -453,10 +443,13 @@ int test_copy3D_all_types(cl_device_id deviceID, cl_context context, kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes }; + // The margins below represent the number of elements between the end of + // one line or plane and the start of the next. The strides are equivalent + // to the size of the line or plane plus the chosen margin. unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 }; - unsigned int smallTypesStrideSizes[] = { 0, 10, 100 }; - unsigned int size, typeIndex, srcLineStride, dstLineStride, srcPlaneStride, - dstPlaneStride; + unsigned int smallTypesMarginSizes[] = { 0, 10, 100 }; + unsigned int size, typeIndex, srcLineMargin, dstLineMargin, srcPlaneMargin, + dstPlaneMargin; int errors = 0; @@ -482,33 +475,33 @@ int test_copy3D_all_types(cl_device_id deviceID, cl_context context, if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size] <= 2) // small type { - for (srcLineStride = 0; - srcLineStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - srcLineStride++) + for (srcLineMargin = 0; + srcLineMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + srcLineMargin++) { - for (dstLineStride = 0; - dstLineStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - dstLineStride++) + for (dstLineMargin = 0; + dstLineMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + dstLineMargin++) { - for (srcPlaneStride = 0; - srcPlaneStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - srcPlaneStride++) + for (srcPlaneMargin = 0; + srcPlaneMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + srcPlaneMargin++) { - for (dstPlaneStride = 0; - dstPlaneStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - dstPlaneStride++) + for (dstPlaneMargin = 0; + dstPlaneMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + dstPlaneMargin++) { if (test_copy3D( deviceID, context, queue, kernelCode, vecType[typeIndex], vecSizes[size], - smallTypesStrideSizes[srcLineStride], - smallTypesStrideSizes[dstLineStride], - smallTypesStrideSizes[srcPlaneStride], - smallTypesStrideSizes[dstPlaneStride], + smallTypesMarginSizes[srcLineMargin], + smallTypesMarginSizes[dstLineMargin], + smallTypesMarginSizes[srcPlaneMargin], + smallTypesMarginSizes[dstPlaneMargin], localIsDst)) { errors++; |