aboutsummaryrefslogtreecommitdiff
path: root/test_conformance/basic/test_async_copy3D.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'test_conformance/basic/test_async_copy3D.cpp')
-rw-r--r--test_conformance/basic/test_async_copy3D.cpp331
1 files changed, 162 insertions, 169 deletions
diff --git a/test_conformance/basic/test_async_copy3D.cpp b/test_conformance/basic/test_async_copy3D.cpp
index 252159bc..5eb41ebc 100644
--- a/test_conformance/basic/test_async_copy3D.cpp
+++ b/test_conformance/basic/test_async_copy3D.cpp
@@ -25,96 +25,95 @@
#include "../../test_common/harness/conversions.h"
#include "procs.h"
-static const char *async_global_to_local_kernel3D =
- "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
- "%s\n" // optional pragma string
- "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
- "%s *localBuffer, int numElementsPerLine, int numLines, int "
- "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, "
- "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n"
- "{\n"
- " int i, j, k;\n"
- // Zero the local storage first
- " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
- " for(j=0; j<numLines; j++)\n"
- " for(k=0; k<numElementsPerLine; k++)\n"
- " localBuffer[ (get_local_id( 0 "
- ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
- "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
- "dstLineStride) + k ] = (%s)(%s)0;\n"
- // Do this to verify all kernels are done zeroing the local buffer before we
- // try the copy
- " barrier( CLK_LOCAL_MEM_FENCE );\n"
- " event_t event;\n"
- " event = async_work_group_copy_3D3D( (__local %s*)localBuffer, "
- "(__global const "
- "%s*)(src+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*"
- "numElementsPerLine + numLines*srcLineStride + srcPlaneStride)), "
- "(size_t)numElementsPerLine, (size_t)numLines, srcLineStride, "
- "dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, dstPlaneStride, "
- "0 );\n"
- // Wait for the copy to complete, then verify by manually copying to the
- // dest
- " wait_group_events( 1, &event );\n"
- " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
- " for(j=0; j<numLines; j++)\n"
- " for(k=0; k<numElementsPerLine; k++)\n"
- " dst[ (get_global_id( 0 "
- ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
- "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
- "dstLineStride) + k ] = localBuffer[ (get_local_id( 0 "
- ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
- "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
- "dstLineStride) + k ];\n"
- "}\n";
-
-static const char *async_local_to_global_kernel3D =
- "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
- "%s\n" // optional pragma string
- "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
- "%s *localBuffer, int numElementsPerLine, int numLines, int "
- "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, "
- "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n"
- "{\n"
- " int i, j, k;\n"
- // Zero the local storage first
- " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
- " for(j=0; j<numLines; j++)\n"
- " for(k=0; k<numElementsPerLine; k++)\n"
- " localBuffer[ (get_local_id( 0 "
- ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
- "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
- "srcLineStride) + k ] = (%s)(%s)0;\n"
- // Do this to verify all kernels are done zeroing the local buffer before we
- // try the copy
- " barrier( CLK_LOCAL_MEM_FENCE );\n"
- " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
- " for(j=0; j<numLines; j++)\n"
- " for(k=0; k<numElementsPerLine; k++)\n"
- " localBuffer[ (get_local_id( 0 "
- ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
- "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
- "srcLineStride) + k ] = src[ (get_global_id( 0 "
- ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
- "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
- "srcLineStride) + k ];\n"
- // Do this to verify all kernels are done copying to the local buffer before
- // we try the copy
- " barrier( CLK_LOCAL_MEM_FENCE );\n"
- " event_t event;\n"
- " event = async_work_group_copy_3D3D((__global "
- "%s*)(dst+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*"
- "numElementsPerLine + numLines*dstLineStride + dstPlaneStride)), (__local "
- "const %s*)localBuffer, (size_t)numElementsPerLine, (size_t)numLines, "
- "srcLineStride, dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, "
- "dstPlaneStride, 0 );\n"
- " wait_group_events( 1, &event );\n"
- "}\n";
+static const char *async_global_to_local_kernel3D = R"OpenCLC(
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+%s // optional pragma string
+
+__kernel void test_fn(const __global %s *src, __global %s *dst, __local %s *localBuffer,
+ int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup,
+ int planesCopiesPerWorkItem, int srcLineStride,
+ int dstLineStride, int srcPlaneStride, int dstPlaneStride ) {
+ // Zero the local storage first
+ for (int i = 0; i < planesCopiesPerWorkItem; i++) {
+ for (int j = 0; j < numLines; j++) {
+ for (int k = 0; k < numElementsPerLine; k++) {
+ const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k;
+ localBuffer[index] = (%s)(%s)0;
+ }
+ }
+ }
+
+ // Do this to verify all kernels are done zeroing the local buffer before we try the copy
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ event_t event = async_work_group_copy_3D3D(localBuffer, 0, src,
+ planesCopiesPerWorkgroup * get_group_id(0) * srcPlaneStride,
+ sizeof(%s), (size_t)numElementsPerLine, (size_t)numLines,
+ planesCopiesPerWorkgroup, srcLineStride, srcPlaneStride, dstLineStride,
+ dstPlaneStride, 0);
+
+ // Wait for the copy to complete, then verify by manually copying to the dest
+ wait_group_events(1, &event);
+
+ for (int i = 0; i < planesCopiesPerWorkItem; i++) {
+ for (int j = 0; j < numLines; j++) {
+ for(int k = 0; k < numElementsPerLine; k++) {
+ const int local_index = (get_local_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k;
+ const int global_index = (get_global_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k;
+ dst[global_index] = localBuffer[local_index];
+ }
+ }
+ }
+}
+)OpenCLC";
+
+static const char *async_local_to_global_kernel3D = R"OpenCLC(
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+%s // optional pragma string
+
+__kernel void test_fn(const __global %s *src, __global %s *dst, __local %s *localBuffer,
+ int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup,
+ int planesCopiesPerWorkItem, int srcLineStride,
+ int dstLineStride, int srcPlaneStride, int dstPlaneStride) {
+ // Zero the local storage first
+ for (int i = 0; i < planesCopiesPerWorkItem; i++) {
+ for (int j = 0; j < numLines; j++) {
+ for (int k = 0; k < numElementsPerLine; k++) {
+ const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j * srcLineStride + k;
+ localBuffer[index] = (%s)(%s)0;
+ }
+ }
+ }
+
+ // Do this to verify all kernels are done zeroing the local buffer before we try the copy
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (int i=0; i < planesCopiesPerWorkItem; i++) {
+ for (int j=0; j < numLines; j++) {
+ for (int k=0; k < numElementsPerLine; k++) {
+ const int local_index = (get_local_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j * srcLineStride + k;
+ const int global_index = (get_global_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j*srcLineStride + k;
+ localBuffer[local_index] = src[global_index];
+ }
+ }
+ }
+
+ // Do this to verify all kernels are done copying to the local buffer before we try the copy
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ event_t event = async_work_group_copy_3D3D(dst,
+ planesCopiesPerWorkgroup * get_group_id(0) * dstPlaneStride, localBuffer, 0,
+ sizeof(%s), (size_t)numElementsPerLine, (size_t)numLines, planesCopiesPerWorkgroup,
+ srcLineStride, srcPlaneStride, dstLineStride, dstPlaneStride, 0);
+
+ wait_group_events(1, &event);
+}
+)OpenCLC";
int test_copy3D(cl_device_id deviceID, cl_context context,
cl_command_queue queue, const char *kernelCode,
- ExplicitType vecType, int vecSize, int srcLineStride,
- int dstLineStride, int srcPlaneStride, int dstPlaneStride,
+ ExplicitType vecType, int vecSize, int srcLineMargin,
+ int dstLineMargin, int srcPlaneMargin, int dstPlaneMargin,
bool localIsDst)
{
int error;
@@ -133,10 +132,10 @@ int test_copy3D(cl_device_id deviceID, cl_context context,
vecSize);
size_t elementSize = get_explicit_type_size(vecType) * vecSize;
- log_info("Testing %s with srcLineStride = %d, dstLineStride = %d, "
- "srcPlaneStride = %d, dstPlaneStride = %d\n",
- vecNameString, srcLineStride, dstLineStride, srcPlaneStride,
- dstPlaneStride);
+ log_info("Testing %s with srcLineMargin = %d, dstLineMargin = %d, "
+ "srcPlaneMargin = %d, dstPlaneMargin = %d\n",
+ vecNameString, srcLineMargin, dstLineMargin, srcPlaneMargin,
+ dstPlaneMargin);
cl_long max_local_mem_size;
error =
@@ -201,16 +200,20 @@ int test_copy3D(cl_device_id deviceID, cl_context context,
if (max_workgroup_size > max_local_workgroup_size[0])
max_workgroup_size = max_local_workgroup_size[0];
- size_t numElementsPerLine = 10;
- size_t numLines = 13;
- size_t planesCopiesPerWorkItem = 2;
+ const size_t numElementsPerLine = 10;
+ const cl_int dstLineStride = numElementsPerLine + dstLineMargin;
+ const cl_int srcLineStride = numElementsPerLine + srcLineMargin;
+
+ const size_t numLines = 13;
+ const cl_int dstPlaneStride = (numLines * dstLineStride) + dstPlaneMargin;
+ const cl_int srcPlaneStride = (numLines * srcLineStride) + srcPlaneMargin;
+
elementSize =
get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
- size_t localStorageSpacePerWorkitem = elementSize
- * (planesCopiesPerWorkItem
- * (numLines * numElementsPerLine
- + numLines * (localIsDst ? dstLineStride : srcLineStride)
- + (localIsDst ? dstPlaneStride : srcPlaneStride)));
+ const size_t planesCopiesPerWorkItem = 2;
+ const size_t localStorageSpacePerWorkitem = elementSize
+ * planesCopiesPerWorkItem
+ * (localIsDst ? dstPlaneStride : srcPlaneStride);
size_t maxLocalWorkgroupSize =
(((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
@@ -224,42 +227,41 @@ int test_copy3D(cl_device_id deviceID, cl_context context,
if (maxLocalWorkgroupSize > max_workgroup_size)
localWorkgroupSize = max_workgroup_size;
- size_t maxTotalPlanesIn = ((max_alloc_size / elementSize) + srcPlaneStride)
- / ((numLines * numElementsPerLine + numLines * srcLineStride)
- + srcPlaneStride);
- size_t maxTotalPlanesOut = ((max_alloc_size / elementSize) + dstPlaneStride)
- / ((numLines * numElementsPerLine + numLines * dstLineStride)
- + dstPlaneStride);
- size_t maxTotalPlanes = (std::min)(maxTotalPlanesIn, maxTotalPlanesOut);
- size_t maxLocalWorkgroups =
+ const size_t maxTotalPlanesIn =
+ ((max_alloc_size / elementSize) + srcPlaneMargin) / srcPlaneStride;
+ const size_t maxTotalPlanesOut =
+ ((max_alloc_size / elementSize) + dstPlaneMargin) / dstPlaneStride;
+ const size_t maxTotalPlanes = std::min(maxTotalPlanesIn, maxTotalPlanesOut);
+ const size_t maxLocalWorkgroups =
maxTotalPlanes / (localWorkgroupSize * planesCopiesPerWorkItem);
- size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem
- - (localIsDst ? dstPlaneStride : srcPlaneStride);
- size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups);
- size_t totalPlanes =
+ const size_t localBufferSize =
+ localWorkgroupSize * localStorageSpacePerWorkitem
+ - (localIsDst ? dstPlaneMargin : srcPlaneMargin);
+ const size_t numberOfLocalWorkgroups =
+ std::min(1111, (int)maxLocalWorkgroups);
+ const size_t totalPlanes =
numberOfLocalWorkgroups * localWorkgroupSize * planesCopiesPerWorkItem;
- size_t inBufferSize = elementSize
- * (totalPlanes
- * (numLines * numElementsPerLine + numLines * srcLineStride)
- + (totalPlanes - 1) * srcPlaneStride);
- size_t outBufferSize = elementSize
- * (totalPlanes
- * (numLines * numElementsPerLine + numLines * dstLineStride)
- + (totalPlanes - 1) * dstPlaneStride);
- size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize;
+ const size_t inBufferSize = elementSize
+ * (totalPlanes * numLines * srcLineStride
+ + (totalPlanes - 1) * srcPlaneMargin);
+ const size_t outBufferSize = elementSize
+ * (totalPlanes * numLines * dstLineStride
+ + (totalPlanes - 1) * dstPlaneMargin);
+ const size_t globalWorkgroupSize =
+ numberOfLocalWorkgroups * localWorkgroupSize;
inBuffer = (void *)malloc(inBufferSize);
outBuffer = (void *)malloc(outBufferSize);
outBufferCopy = (void *)malloc(outBufferSize);
- cl_int planesCopiesPerWorkItemInt, numElementsPerLineInt, numLinesInt,
- planesCopiesPerWorkgroup;
- planesCopiesPerWorkItemInt = (int)planesCopiesPerWorkItem;
- numElementsPerLineInt = (int)numElementsPerLine;
- numLinesInt = (int)numLines;
- planesCopiesPerWorkgroup =
- (int)(planesCopiesPerWorkItem * localWorkgroupSize);
+ const cl_int planesCopiesPerWorkItemInt =
+ static_cast<cl_int>(planesCopiesPerWorkItem);
+ const cl_int numElementsPerLineInt =
+ static_cast<cl_int>(numElementsPerLine);
+ const cl_int numLinesInt = static_cast<cl_int>(numLines);
+ const cl_int planesCopiesPerWorkgroup =
+ static_cast<cl_int>(planesCopiesPerWorkItem * localWorkgroupSize);
log_info("Global: %d, local %d, local buffer %db, global in buffer %db, "
"global out buffer %db, each work group will copy %d planes and "
@@ -336,14 +338,8 @@ int test_copy3D(cl_device_id deviceID, cl_context context,
for (int k = 0; k < (int)numElementsPerLine * elementSize;
k += elementSize)
{
- int inIdx = i
- * (numLines * numElementsPerLine
- + numLines * srcLineStride + srcPlaneStride)
- + j * (numElementsPerLine + srcLineStride) + k;
- int outIdx = i
- * (numLines * numElementsPerLine
- + numLines * dstLineStride + dstPlaneStride)
- + j * (numElementsPerLine + dstLineStride) + k;
+ int inIdx = i * srcPlaneStride + j * srcLineStride + k;
+ int outIdx = i * dstPlaneStride + j * dstLineStride + k;
if (memcmp(((char *)inBuffer) + inIdx,
((char *)outBuffer) + outIdx, typeSize)
!= 0)
@@ -378,14 +374,11 @@ int test_copy3D(cl_device_id deviceID, cl_context context,
}
if (j < (int)numLines * elementSize)
{
- int outIdx = i
- * (numLines * numElementsPerLine
- + numLines * dstLineStride + dstPlaneStride)
- + j * (numElementsPerLine + dstLineStride)
+ int outIdx = i * dstPlaneStride + j * dstLineStride
+ numElementsPerLine * elementSize;
if (memcmp(((char *)outBuffer) + outIdx,
((char *)outBufferCopy) + outIdx,
- dstLineStride * elementSize)
+ dstLineMargin * elementSize)
!= 0)
{
if (failuresPrinted == 0)
@@ -409,14 +402,11 @@ int test_copy3D(cl_device_id deviceID, cl_context context,
if (i < (int)(globalWorkgroupSize * planesCopiesPerWorkItem - 1)
* elementSize)
{
- int outIdx = i
- * (numLines * numElementsPerLine + numLines * dstLineStride
- + dstPlaneStride)
- + (numLines * elementSize) * (numElementsPerLine)
- + (numLines * elementSize) * (dstLineStride);
+ int outIdx =
+ i * dstPlaneStride + numLines * dstLineStride * elementSize;
if (memcmp(((char *)outBuffer) + outIdx,
((char *)outBufferCopy) + outIdx,
- dstPlaneStride * elementSize)
+ dstPlaneMargin * elementSize)
!= 0)
{
if (failuresPrinted == 0)
@@ -453,10 +443,13 @@ int test_copy3D_all_types(cl_device_id deviceID, cl_context context,
kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong,
kULong, kFloat, kDouble, kNumExplicitTypes
};
+ // The margins below represent the number of elements between the end of
+ // one line or plane and the start of the next. The strides are equivalent
+ // to the size of the line or plane plus the chosen margin.
unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
- unsigned int smallTypesStrideSizes[] = { 0, 10, 100 };
- unsigned int size, typeIndex, srcLineStride, dstLineStride, srcPlaneStride,
- dstPlaneStride;
+ unsigned int smallTypesMarginSizes[] = { 0, 10, 100 };
+ unsigned int size, typeIndex, srcLineMargin, dstLineMargin, srcPlaneMargin,
+ dstPlaneMargin;
int errors = 0;
@@ -482,33 +475,33 @@ int test_copy3D_all_types(cl_device_id deviceID, cl_context context,
if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size]
<= 2) // small type
{
- for (srcLineStride = 0;
- srcLineStride < sizeof(smallTypesStrideSizes)
- / sizeof(smallTypesStrideSizes[0]);
- srcLineStride++)
+ for (srcLineMargin = 0;
+ srcLineMargin < sizeof(smallTypesMarginSizes)
+ / sizeof(smallTypesMarginSizes[0]);
+ srcLineMargin++)
{
- for (dstLineStride = 0;
- dstLineStride < sizeof(smallTypesStrideSizes)
- / sizeof(smallTypesStrideSizes[0]);
- dstLineStride++)
+ for (dstLineMargin = 0;
+ dstLineMargin < sizeof(smallTypesMarginSizes)
+ / sizeof(smallTypesMarginSizes[0]);
+ dstLineMargin++)
{
- for (srcPlaneStride = 0;
- srcPlaneStride < sizeof(smallTypesStrideSizes)
- / sizeof(smallTypesStrideSizes[0]);
- srcPlaneStride++)
+ for (srcPlaneMargin = 0;
+ srcPlaneMargin < sizeof(smallTypesMarginSizes)
+ / sizeof(smallTypesMarginSizes[0]);
+ srcPlaneMargin++)
{
- for (dstPlaneStride = 0;
- dstPlaneStride < sizeof(smallTypesStrideSizes)
- / sizeof(smallTypesStrideSizes[0]);
- dstPlaneStride++)
+ for (dstPlaneMargin = 0;
+ dstPlaneMargin < sizeof(smallTypesMarginSizes)
+ / sizeof(smallTypesMarginSizes[0]);
+ dstPlaneMargin++)
{
if (test_copy3D(
deviceID, context, queue, kernelCode,
vecType[typeIndex], vecSizes[size],
- smallTypesStrideSizes[srcLineStride],
- smallTypesStrideSizes[dstLineStride],
- smallTypesStrideSizes[srcPlaneStride],
- smallTypesStrideSizes[dstPlaneStride],
+ smallTypesMarginSizes[srcLineMargin],
+ smallTypesMarginSizes[dstLineMargin],
+ smallTypesMarginSizes[srcPlaneMargin],
+ smallTypesMarginSizes[dstPlaneMargin],
localIsDst))
{
errors++;