aboutsummaryrefslogtreecommitdiff
path: root/test_conformance/basic/test_vloadstore.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'test_conformance/basic/test_vloadstore.cpp')
-rw-r--r--test_conformance/basic/test_vloadstore.cpp800
1 files changed, 358 insertions, 442 deletions
diff --git a/test_conformance/basic/test_vloadstore.cpp b/test_conformance/basic/test_vloadstore.cpp
index e137f9e7..d34ecbf9 100644
--- a/test_conformance/basic/test_vloadstore.cpp
+++ b/test_conformance/basic/test_vloadstore.cpp
@@ -13,52 +13,129 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//
-#include "harness/compat.h"
-
+#include <algorithm>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/stat.h>
+#include <vector>
+#include <CL/cl_half.h>
#include "procs.h"
#include "harness/conversions.h"
-#include "harness/typeWrappers.h"
#include "harness/errorHelpers.h"
+#include "harness/stringHelpers.h"
+#include "harness/typeWrappers.h"
// Outputs debug information for stores
#define DEBUG 0
// Forces stores/loads to be done with offsets = tid
#define LINEAR_OFFSETS 0
#define NUM_LOADS 512
-
-static const char *doubleExtensionPragma = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+#define HFF(num) cl_half_from_float(num, halfRoundingMode)
+#define HTF(num) cl_half_to_float(num)
+
+char pragma_str[128] = { 0 };
+char mem_type[64] = { 0 };
+char store_str[128] = { 0 };
+char load_str[128] = { 0 };
+
+extern cl_half_rounding_mode halfRoundingMode;
+
+// clang-format off
+static const char *store_pattern= "results[ tid ] = tmp;\n";
+static const char *store_patternV3 = "results[3*tid] = tmp.s0; results[3*tid+1] = tmp.s1; results[3*tid+2] = tmp.s2;\n";
+static const char *load_pattern = "sSharedStorage[ i ] = src[ i ];\n";
+static const char *load_patternV3 = "sSharedStorage[3*i] = src[ 3*i]; sSharedStorage[3*i+1] = src[3*i+1]; sSharedStorage[3*i+2] = src[3*i+2];\n";
+static const char *kernel_pattern[] = {
+pragma_str,
+"#define STYPE %s\n"
+"__kernel void test_fn( ", mem_type, " STYPE *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
+"{\n"
+" int tid = get_global_id( 0 );\n"
+" %s%d tmp = vload%d( offsets[ tid ], ( (", mem_type, " STYPE *) src ) + alignmentOffsets[ tid ] );\n"
+" ", store_str,
+"}\n"
+};
+
+const char *pattern_local [] = {
+pragma_str,
+"__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
+"{\n"
+" int tid = get_global_id( 0 );\n"
+" int lid = get_local_id( 0 );\n"
+"\n"
+" if( lid == 0 )\n"
+" {\n"
+" for( int i = 0; i < %d; i++ ) {\n"
+" ", load_str,
+" }\n"
+" }\n"
+// Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
+// threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
+// updated on all threads at that point
+" barrier( CLK_LOCAL_MEM_FENCE );\n"
+"\n"
+" %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
+" ", store_str,
+"}\n" };
+
+const char *pattern_priv [] = {
+pragma_str,
+// Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
+// for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
+"#define PRIV_TYPE %s\n"
+"#define PRIV_SIZE %d\n"
+"__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
+"{\n"
+" __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
+" int tid = get_global_id( 0 );\n"
+"\n"
+" for( int i = 0; i < PRIV_SIZE; i++ )\n"
+" sPrivateStorage[ i ] = src[ i ];\n"
+// Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
+// anybody else to sync up
+"\n"
+" %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
+" ", store_str,
+"}\n"};
+// clang-format on
#pragma mark -------------------- vload harness --------------------------
-typedef void (*create_vload_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize );
+typedef void (*create_program_fn)(std::string &, size_t, ExplicitType, size_t,
+ size_t);
+typedef int (*test_fn)(cl_device_id, cl_context, cl_command_queue, ExplicitType,
+ unsigned int, create_program_fn, size_t);
-int test_vload( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
- create_vload_program_fn createFn, size_t bufferSize, MTdata d )
+int test_vload(cl_device_id device, cl_context context, cl_command_queue queue,
+ ExplicitType type, unsigned int vecSize,
+ create_program_fn createFn, size_t bufferSize)
{
- int error;
-
clProgramWrapper program;
clKernelWrapper kernel;
clMemWrapper streams[ 4 ];
+ MTdataHolder d(gRandomSeed);
const size_t numLoads = (DEBUG) ? 16 : NUM_LOADS;
if (DEBUG) bufferSize = (bufferSize < 128) ? bufferSize : 128;
size_t threads[ 1 ], localThreads[ 1 ];
clProtectedArray inBuffer( bufferSize );
- char programSrc[ 10240 ];
cl_uint offsets[ numLoads ], alignmentOffsets[ numLoads ];
size_t numElements, typeSize, i;
unsigned int outVectorSize;
+ pragma_str[0] = '\0';
+ if (type == kDouble)
+ std::snprintf(pragma_str, sizeof(pragma_str),
+ "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
+ else if (type == kHalf)
+ std::snprintf(pragma_str, sizeof(pragma_str),
+ "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");
typeSize = get_explicit_type_size( type );
numElements = bufferSize / ( typeSize * vecSize );
@@ -83,25 +160,19 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue,
outVectorSize = vecSize;
// Declare output buffers now
-#if !(defined(_WIN32) && defined(_MSC_VER))
- char outBuffer[ numLoads * typeSize * outVectorSize ];
- char referenceBuffer[ numLoads * typeSize * vecSize ];
-#else
- char* outBuffer = (char*)_malloca(numLoads * typeSize * outVectorSize * sizeof(cl_char));
- char* referenceBuffer = (char*)_malloca(numLoads * typeSize * vecSize * sizeof(cl_char));
-#endif
+ std::vector<char> outBuffer(numLoads * typeSize * outVectorSize);
+ std::vector<char> referenceBuffer(numLoads * typeSize * vecSize);
// Create the program
-
-
+ std::string programSrc;
createFn( programSrc, numElements, type, vecSize, outVectorSize);
// Create our kernel
- const char *ptr = programSrc;
-
- error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
+ const char *ptr = programSrc.c_str();
+ cl_int error = create_single_kernel_helper(context, &program, &kernel, 1,
+ &ptr, "test_fn");
test_error( error, "Unable to create testing kernel" );
- if (DEBUG) log_info("Kernel: \n%s\n", programSrc);
+ if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str());
// Get the number of args to differentiate the kernels with local storage. (They have 5)
cl_uint numArgs;
@@ -115,7 +186,9 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue,
test_error( error, "Unable to create kernel stream" );
streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(alignmentOffsets[0]), alignmentOffsets, &error );
test_error( error, "Unable to create kernel stream" );
- streams[ 3 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*typeSize*outVectorSize, (void *)outBuffer, &error );
+ streams[3] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+ numLoads * typeSize * outVectorSize,
+ (void *)outBuffer.data(), &error);
test_error( error, "Unable to create kernel stream" );
// Set parameters and run
@@ -145,28 +218,32 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue,
test_error( error, "Unable to exec kernel" );
// Get the results
- error = clEnqueueReadBuffer( queue, streams[ 3 ], CL_TRUE, 0, numLoads * typeSize * outVectorSize * sizeof(cl_char), (void *)outBuffer, 0, NULL, NULL );
+ error = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0,
+ numLoads * typeSize * outVectorSize
+ * sizeof(cl_char),
+ (void *)outBuffer.data(), 0, NULL, NULL);
test_error( error, "Unable to read results" );
-
// Create the reference results
- memset( referenceBuffer, 0, numLoads * typeSize * vecSize * sizeof(cl_char));
+ referenceBuffer.assign(numLoads * typeSize * vecSize, 0);
for( i = 0; i < numLoads; i++ )
{
- memcpy( referenceBuffer + i * typeSize * vecSize, ( (char *)(void *)inBuffer ) + ( ( offsets[ i ] * vecSize ) + alignmentOffsets[ i ] ) * typeSize,
- typeSize * vecSize );
+ memcpy(&referenceBuffer[i * typeSize * vecSize],
+ ((char *)(void *)inBuffer)
+ + ((offsets[i] * vecSize) + alignmentOffsets[i]) * typeSize,
+ typeSize * vecSize);
}
// Validate the results now
- char *expected = referenceBuffer;
- char *actual = outBuffer;
+ char *expected = referenceBuffer.data();
+ char *actual = outBuffer.data();
char *in = (char *)(void *)inBuffer;
if (DEBUG) {
log_info("Memory contents:\n");
+ char inString[1024];
+ char expectedString[1024], actualString[1024];
for (i=0; i<numElements; i++) {
- char inString[1024];
- char expectedString[ 1024 ], actualString[ 1024 ];
if (i < numLoads) {
log_info("buffer %3d: input: %s expected: %s got: %s (load offset %3d, alignment offset %3d)", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
@@ -197,35 +274,42 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue,
expected += typeSize * vecSize;
actual += typeSize * outVectorSize;
}
-
return 0;
}
-int test_vloadset(cl_device_id device, cl_context context, cl_command_queue queue, create_vload_program_fn createFn, size_t bufferSize )
+template <test_fn test_func_ptr>
+int test_vset(cl_device_id device, cl_context context, cl_command_queue queue,
+ create_program_fn createFn, size_t bufferSize)
{
- ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
+ std::vector<ExplicitType> vecType = { kChar, kUChar, kShort, kUShort,
+ kInt, kUInt, kLong, kULong,
+ kFloat, kHalf, kDouble };
unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
const char *size_names[] = { "2", "3", "4", "8", "16"};
- unsigned int typeIdx, sizeIdx;
int error = 0;
- MTdata mtData = init_genrand( gRandomSeed );
log_info("Testing with buffer size of %d.\n", (int)bufferSize);
- for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ )
- {
+ bool hasDouble = is_extension_available(device, "cl_khr_fp64");
+ bool hasHalf = is_extension_available(device, "cl_khr_fp16");
- if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
+ for (unsigned typeIdx = 0; typeIdx < vecType.size(); typeIdx++)
+ {
+ if (vecType[typeIdx] == kDouble && !hasDouble)
continue;
-
- if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong )
+ else if (vecType[typeIdx] == kHalf && !hasHalf)
+ continue;
+ else if ((vecType[typeIdx] == kLong || vecType[typeIdx] == kULong)
+ && !gHasLong)
continue;
- for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
+ for (unsigned sizeIdx = 0; vecSizes[sizeIdx] != 0; sizeIdx++)
{
log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);
- int error_this_type = test_vload( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, mtData );
+ int error_this_type =
+ test_func_ptr(device, context, queue, vecType[typeIdx],
+ vecSizes[sizeIdx], createFn, bufferSize);
if (error_this_type) {
error += error_this_type;
log_error("Failure; skipping further sizes for this type.");
@@ -233,125 +317,59 @@ int test_vloadset(cl_device_id device, cl_context context, cl_command_queue queu
}
}
}
-
- free_mtdata(mtData);
-
return error;
}
#pragma mark -------------------- vload test cases --------------------------
-void create_global_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_global_load_code(std::string &destBuffer, size_t inBufferSize,
+ ExplicitType type, size_t inVectorSize,
+ size_t outVectorSize)
{
- const char *pattern =
- "%s%s"
- "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
- "{\n"
- " int tid = get_global_id( 0 );\n"
- " %s%d tmp = vload%d( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
- " results[ tid ] = tmp;\n"
- "}\n";
-
- const char *patternV3 =
- "%s%s"
- "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
- "{\n"
- " int tid = get_global_id( 0 );\n"
- " %s3 tmp = vload3( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
- " results[ 3*tid ] = tmp.s0;\n"
- " results[ 3*tid+1 ] = tmp.s1;\n"
- " results[ 3*tid+2 ] = tmp.s2;\n"
- "}\n";
-
+ std::snprintf(mem_type, sizeof(mem_type), "__global");
+ std::snprintf(store_str, sizeof(store_str), store_patternV3);
const char *typeName = get_explicit_type_name(type);
- if(inVectorSize == 3) {
- sprintf( destBuffer, patternV3,
- type == kDouble ? doubleExtensionPragma : "",
- "",
- typeName, typeName, typeName, typeName );
- } else {
- sprintf( destBuffer, pattern, type == kDouble ? doubleExtensionPragma : "",
- "",
- typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize,
- (int)inVectorSize, typeName );
+ std::string outTypeName = typeName;
+ if (inVectorSize != 3)
+ {
+ outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+ std::snprintf(store_str, sizeof(store_str), store_pattern);
}
+
+ std::string kernel_src = concat_kernel(
+ kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0]));
+ destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(),
+ typeName, (int)inVectorSize, (int)inVectorSize);
}
int test_vload_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
- return test_vloadset( device, context, queue, create_global_load_code, 10240 );
+ return test_vset<test_vload>(device, context, queue,
+ create_global_load_code, 10240);
}
-
-void create_local_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_local_load_code(std::string &destBuffer, size_t inBufferSize,
+ ExplicitType type, size_t inVectorSize,
+ size_t outVectorSize)
{
- const char *pattern =
- "%s%s"
- //" __local %s%d sSharedStorage[ %d ];\n"
- "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
- "{\n"
- " int tid = get_global_id( 0 );\n"
- " int lid = get_local_id( 0 );\n"
- "\n"
- " if( lid == 0 )\n"
- " {\n"
- " for( int i = 0; i < %d; i++ )\n"
- " sSharedStorage[ i ] = src[ i ];\n"
- " }\n"
- // Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
- // threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
- // updated on all threads at that point
- " barrier( CLK_LOCAL_MEM_FENCE );\n"
- "\n"
- " %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
- " results[ tid ] = tmp;\n"
- "}\n";
-
- const char *patternV3 =
- "%s%s"
- //" __local %s%d sSharedStorage[ %d ];\n"
- "__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
- "{\n"
- " int tid = get_global_id( 0 );\n"
- " int lid = get_local_id( 0 );\n"
- "\n"
- " if( lid == 0 )\n"
- " {\n"
- " for( int i = 0; i < %d; i++ ) {\n"
- " sSharedStorage[ 3*i ] = src[ 3*i ];\n"
- " sSharedStorage[ 3*i +1] = src[ 3*i +1];\n"
- " sSharedStorage[ 3*i +2] = src[ 3*i +2];\n"
- " }\n"
- " }\n"
- // Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
- // threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
- // updated on all threads at that point
- " barrier( CLK_LOCAL_MEM_FENCE );\n"
- "\n"
- " %s3 tmp = vload3( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
- " results[ 3*tid ] = tmp.s0;\n"
- " results[ 3*tid +1] = tmp.s1;\n"
- " results[ 3*tid +2] = tmp.s2;\n"
- "}\n";
-
+ std::snprintf(store_str, sizeof(store_str), store_patternV3);
+ std::snprintf(load_str, sizeof(load_str), load_patternV3);
const char *typeName = get_explicit_type_name(type);
- if(inVectorSize == 3) {
- sprintf( destBuffer, patternV3,
- type == kDouble ? doubleExtensionPragma : "",
- "",
- typeName, /*(int)inBufferSize,*/
- typeName, typeName,
- (int)inBufferSize,
- typeName, typeName );
- } else {
- sprintf( destBuffer, pattern,
- type == kDouble ? doubleExtensionPragma : "",
- "",
- typeName, (int)inVectorSize, /*(int)inBufferSize,*/
- typeName, (int)inVectorSize, typeName, (int)outVectorSize,
- (int)inBufferSize,
- typeName, (int)inVectorSize, (int)inVectorSize, typeName );
+ std::string outTypeName = typeName;
+ std::string inTypeName = typeName;
+ if (inVectorSize != 3)
+ {
+ outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+ inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize);
+ std::snprintf(store_str, sizeof(store_str), store_pattern);
+ std::snprintf(load_str, sizeof(load_str), load_pattern);
}
+
+ std::string kernel_src = concat_kernel(
+ pattern_local, sizeof(pattern_local) / sizeof(pattern_local[0]));
+ destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), inTypeName.c_str(),
+ outTypeName.c_str(), (int)inBufferSize, typeName,
+ (int)inVectorSize, (int)inVectorSize, typeName);
}
int test_vload_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
@@ -360,53 +378,34 @@ int test_vload_local(cl_device_id device, cl_context context, cl_command_queue q
cl_ulong localSize;
int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
test_error( error, "Unable to get max size of local memory buffer" );
- if( localSize > 10240 )
- localSize = 10240;
+ if (localSize > 10240) localSize = 10240;
if (localSize > 4096)
localSize -= 2048;
else
localSize /= 2;
- return test_vloadset( device, context, queue, create_local_load_code, (size_t)localSize );
+ return test_vset<test_vload>(device, context, queue, create_local_load_code,
+ (size_t)localSize);
}
-
-void create_constant_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_constant_load_code(std::string &destBuffer, size_t inBufferSize,
+ ExplicitType type, size_t inVectorSize,
+ size_t outVectorSize)
{
- const char *pattern =
- "%s%s"
- "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
- "{\n"
- " int tid = get_global_id( 0 );\n"
- " %s%d tmp = vload%d( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
- " results[ tid ] = tmp;\n"
- "}\n";
-
- const char *patternV3 =
- "%s%s"
- "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
- "{\n"
- " int tid = get_global_id( 0 );\n"
- " %s3 tmp = vload3( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
- " results[ 3*tid ] = tmp.s0;\n"
- " results[ 3*tid+1 ] = tmp.s1;\n"
- " results[ 3*tid+2 ] = tmp.s2;\n"
- "}\n";
-
+ std::snprintf(mem_type, sizeof(mem_type), "__constant");
+ std::snprintf(store_str, sizeof(store_str), store_patternV3);
const char *typeName = get_explicit_type_name(type);
- if(inVectorSize == 3) {
- sprintf( destBuffer, patternV3,
- type == kDouble ? doubleExtensionPragma : "",
- "",
- typeName, typeName, typeName,
- typeName );
- } else {
- sprintf( destBuffer, pattern,
- type == kDouble ? doubleExtensionPragma : "",
- "",
- typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize,
- (int)inVectorSize, typeName );
+ std::string outTypeName = typeName;
+ if (inVectorSize != 3)
+ {
+ outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+ std::snprintf(store_str, sizeof(store_str), store_pattern);
}
+
+ std::string kernel_src = concat_kernel(
+ kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0]));
+ destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(),
+ typeName, (int)inVectorSize, (int)inVectorSize);
}
int test_vload_constant(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
@@ -415,109 +414,71 @@ int test_vload_constant(cl_device_id device, cl_context context, cl_command_queu
cl_ulong maxSize;
int error = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, NULL );
test_error( error, "Unable to get max size of constant memory buffer" );
- if( maxSize > 10240 )
- maxSize = 10240;
+ if (maxSize > 10240) maxSize = 10240;
if (maxSize > 4096)
maxSize -= 2048;
else
maxSize /= 2;
- return test_vloadset( device, context, queue, create_constant_load_code, (size_t)maxSize );
+ return test_vset<test_vload>(device, context, queue,
+ create_constant_load_code, (size_t)maxSize);
}
-
-void create_private_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_private_load_code(std::string &destBuffer, size_t inBufferSize,
+ ExplicitType type, size_t inVectorSize,
+ size_t outVectorSize)
{
- const char *pattern =
- "%s%s"
- // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
- // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
- "#define PRIV_TYPE %s%d\n"
- "#define PRIV_SIZE %d\n"
- "__kernel void test_fn( __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
- "{\n"
- " __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
- " int tid = get_global_id( 0 );\n"
- "\n"
- " for( int i = 0; i < %d; i++ )\n"
- " sPrivateStorage[ i ] = src[ i ];\n"
- // Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
- // anybody else to sync up
- "\n"
- " %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
- " results[ tid ] = tmp;\n"
- "}\n";
-
- const char *patternV3 =
- "%s%s"
- // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
- // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
- "#define PRIV_TYPE %s\n"
- "#define PRIV_SIZE %d\n"
- "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
- "{\n"
- " __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
- " int tid = get_global_id( 0 );\n"
- "\n"
- " for( int i = 0; i < PRIV_SIZE; i++ )\n"
- " {\n"
- " sPrivateStorage[ i ] = src[ i ];\n"
- " }\n"
- // Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
- // anybody else to sync up
- "\n"
- " %s3 tmp = vload3( offsets[ tid ], ( sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
- " results[ 3*tid ] = tmp.s0;\n"
- " results[ 3*tid+1 ] = tmp.s1;\n"
- " results[ 3*tid+2 ] = tmp.s2;\n"
- "}\n";
-
+ std::snprintf(store_str, sizeof(store_str), store_patternV3);
const char *typeName = get_explicit_type_name(type);
- if(inVectorSize ==3) {
- sprintf( destBuffer, patternV3,
- type == kDouble ? doubleExtensionPragma : "",
- "",
- typeName, 3*((int)inBufferSize),
- typeName, typeName,
- typeName );
- // log_info("Src is \"\n%s\n\"\n", destBuffer);
- } else {
- sprintf( destBuffer, pattern,
- type == kDouble ? doubleExtensionPragma : "",
- "",
- typeName, (int)inVectorSize, (int)inBufferSize,
- typeName, (int)inVectorSize, typeName, (int)outVectorSize,
- (int)inBufferSize,
- typeName, (int)inVectorSize, (int)inVectorSize, typeName );
+ std::string outTypeName = typeName;
+ std::string inTypeName = typeName;
+ int bufSize = (int)inBufferSize * 3;
+ if (inVectorSize != 3)
+ {
+ outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+ inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize);
+ bufSize = (int)inBufferSize;
+ std::snprintf(store_str, sizeof(store_str), store_pattern);
}
+
+ std::string kernel_src = concat_kernel(
+ pattern_priv, sizeof(pattern_priv) / sizeof(pattern_priv[0]));
+ destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), bufSize,
+ inTypeName.c_str(), outTypeName.c_str(), typeName,
+ (int)inVectorSize, (int)inVectorSize, typeName);
}
int test_vload_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
// We have no idea how much actual private storage is available, so just pick a reasonable value,
// which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes
- return test_vloadset( device, context, queue, create_private_load_code, 256 );
+ return test_vset<test_vload>(device, context, queue,
+ create_private_load_code, 256);
}
-
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma mark -------------------- vstore harness --------------------------
-typedef void (*create_vstore_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize );
-
-int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
- create_vstore_program_fn createFn, size_t bufferSize, MTdata d )
+int test_vstore(cl_device_id device, cl_context context, cl_command_queue queue,
+ ExplicitType type, unsigned int vecSize,
+ create_program_fn createFn, size_t bufferSize)
{
- int error;
-
clProgramWrapper program;
clKernelWrapper kernel;
clMemWrapper streams[ 3 ];
+ MTdataHolder d(gRandomSeed);
size_t threads[ 1 ], localThreads[ 1 ];
-
size_t numElements, typeSize, numStores = (DEBUG) ? 16 : NUM_LOADS;
+ pragma_str[0] = '\0';
+ if (type == kDouble)
+ std::snprintf(pragma_str, sizeof(pragma_str),
+ "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
+ else if (type == kHalf)
+ std::snprintf(pragma_str, sizeof(pragma_str),
+ "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");
+
if (DEBUG)
bufferSize = (bufferSize < 128) ? bufferSize : 128;
@@ -534,39 +495,22 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
}
if (DEBUG)
log_info("Testing: numStores: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numStores, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);
-#if !(defined(_WIN32) && defined(_MSC_VER))
- cl_uint offsets[ numStores ];
-#else
- cl_uint* offsets = (cl_uint*)_malloca(numStores * sizeof(cl_uint));
-#endif
- char programSrc[ 10240 ];
- size_t i;
-
-#if !(defined(_WIN32) && defined(_MSC_VER))
- char inBuffer[ numStores * typeSize * vecSize ];
-#else
- char* inBuffer = (char*)_malloca( numStores * typeSize * vecSize * sizeof(cl_char));
-#endif
+
+ std::vector<cl_uint> offsets(numStores);
+ std::vector<char> inBuffer(numStores * typeSize * vecSize);
+
clProtectedArray outBuffer( numElements * typeSize * vecSize );
-#if !(defined(_WIN32) && defined(_MSC_VER))
- char referenceBuffer[ numElements * typeSize * vecSize ];
-#else
- char* referenceBuffer = (char*)_malloca(numElements * typeSize * vecSize * sizeof(cl_char));
-#endif
+ std::vector<char> referenceBuffer(numElements * typeSize * vecSize);
// Create some random input data and random offsets to load from
- generate_random_data( type, numStores * vecSize, d, (void *)inBuffer );
+ generate_random_data(type, numStores * vecSize, d, (void *)inBuffer.data());
// Note: make sure no two offsets are the same, otherwise the output would depend on
// the order that threads ran in, and that would be next to impossible to verify
-#if !(defined(_WIN32) && defined(_MSC_VER))
- char flags[ numElements ];
-#else
- char* flags = (char*)_malloca( numElements * sizeof(char));
-#endif
-
- memset( flags, 0, numElements * sizeof(char) );
- for( i = 0; i < numStores; i++ )
+ std::vector<char> flags(numElements);
+ flags.assign(flags.size(), 0);
+
+ for (size_t i = 0; i < numStores; i++)
{
do
{
@@ -579,13 +523,15 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
if (LINEAR_OFFSETS)
log_info("Offsets set to thread IDs to simplify output.\n");
- createFn( programSrc, numElements, type, vecSize );
+ std::string programSrc;
+ createFn(programSrc, numElements, type, vecSize, vecSize);
// Create our kernel
- const char *ptr = programSrc;
- error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
+ const char *ptr = programSrc.c_str();
+ cl_int error = create_single_kernel_helper(context, &program, &kernel, 1,
+ &ptr, "test_fn");
test_error( error, "Unable to create testing kernel" );
- if (DEBUG) log_info("Kernel: \n%s\n", programSrc);
+ if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str());
// Get the number of args to differentiate the kernels with local storage. (They have 5)
cl_uint numArgs;
@@ -593,9 +539,14 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
test_error( error, "clGetKernelInfo failed");
// Set up parameters
- streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * typeSize * vecSize * sizeof(cl_char), (void *)inBuffer, &error );
+ streams[0] =
+ clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+ numStores * typeSize * vecSize * sizeof(cl_char),
+ (void *)inBuffer.data(), &error);
test_error( error, "Unable to create kernel stream" );
- streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * sizeof(cl_uint), offsets, &error );
+ streams[1] =
+ clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+ numStores * sizeof(cl_uint), offsets.data(), &error);
test_error( error, "Unable to create kernel stream" );
streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numElements * typeSize * vecSize, (void *)outBuffer, &error );
test_error( error, "Unable to create kernel stream" );
@@ -606,7 +557,7 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
// We need to set the size of the local storage
error = clSetKernelArg(kernel, 0, bufferSize, NULL);
test_error( error, "clSetKernelArg for buffer failed");
- for( i = 0; i < 3; i++ )
+ for (size_t i = 0; i < 3; i++)
{
error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
test_error( error, "Unable to set kernel argument" );
@@ -615,11 +566,10 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
else
{
// No local storage
- for( i = 0; i < 3; i++ )
+ for (size_t i = 0; i < 3; i++)
{
error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
- if (error)
- log_info("%s\n", programSrc);
+ if (error) log_info("%s\n", programSrc.c_str());
test_error( error, "Unable to set kernel argument" );
}
}
@@ -654,25 +604,26 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
error = clEnqueueReadBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
test_error( error, "Unable to read results" );
-
// Create the reference results
- memset( referenceBuffer, 0, numElements * typeSize * vecSize * sizeof(cl_char) );
- for( i = 0; i < numStores; i++ )
+ referenceBuffer.assign(referenceBuffer.size(), 0);
+ for (size_t i = 0; i < numStores; i++)
{
- memcpy( referenceBuffer + ( ( offsets[ i ] * vecSize ) + addressOffset ) * typeSize, inBuffer + i * typeSize * vecSize, typeSize * vecSize );
+ memcpy(&referenceBuffer[((offsets[i] * vecSize) + addressOffset)
+ * typeSize],
+ &inBuffer[i * typeSize * vecSize], typeSize * vecSize);
}
// Validate the results now
- char *expected = referenceBuffer;
+ char *expected = referenceBuffer.data();
char *actual = (char *)(void *)outBuffer;
if (DEBUG)
{
log_info("Memory contents:\n");
- for (i=0; i<numElements; i++)
+ char inString[1024];
+ char expectedString[1024], actualString[1024];
+ for (size_t i = 0; i < numElements; i++)
{
- char inString[1024];
- char expectedString[ 1024 ], actualString[ 1024 ];
if (i < numStores)
{
log_info("buffer %3d: input: %s expected: %s got: %s (store offset %3d)", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
@@ -693,7 +644,7 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
}
}
- for( i = 0; i < numElements; i++ )
+ for (size_t i = 0; i < numElements; i++)
{
if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
{
@@ -719,62 +670,26 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
actual += typeSize * vecSize;
}
}
-
return 0;
}
-int test_vstoreset(cl_device_id device, cl_context context, cl_command_queue queue, create_vstore_program_fn createFn, size_t bufferSize )
-{
- ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
- unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
- const char *size_names[] = { "2", "3", "4", "8", "16"};
- unsigned int typeIdx, sizeIdx;
- int error = 0;
- MTdata d = init_genrand( gRandomSeed );
-
- log_info("Testing with buffer size of %d.\n", (int)bufferSize);
-
- for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ )
- {
- if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
- continue;
-
- if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong )
- continue;
-
- for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
- {
- log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);
-
- int error_this_type = test_vstore( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, d );
- if (error_this_type)
- {
- log_error("Failure; skipping further sizes for this type.\n");
- error += error_this_type;
- break;
- }
- }
- }
-
- free_mtdata(d);
- return error;
-}
-
-
#pragma mark -------------------- vstore test cases --------------------------
-void create_global_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
+void create_global_store_code(std::string &destBuffer, size_t inBufferSize,
+ ExplicitType type, size_t inVectorSize,
+ size_t /*unused*/)
{
- const char *pattern =
- "%s"
+ // clang-format off
+ const char *pattern [] = {
+ pragma_str,
"__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
"{\n"
" int tid = get_global_id( 0 );\n"
" vstore%d( srcValues[ tid ], offsets[ tid ], destBuffer + alignmentOffset );\n"
- "}\n";
+ "}\n" };
- const char *patternV3 =
- "%s"
+ const char *patternV3 [] = {
+ pragma_str,
"__kernel void test_fn( __global %s3 *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
"{\n"
" int tid = get_global_id( 0 );\n"
@@ -783,45 +698,48 @@ void create_global_store_code( char *destBuffer, size_t inBufferSize, ExplicitTy
" } else {\n"
" vstore3( vload3(tid, (__global %s *)srcValues), offsets[ tid ], destBuffer + alignmentOffset );\n"
" }\n"
- "}\n";
+ "}\n" };
+ // clang-format on
const char *typeName = get_explicit_type_name(type);
-
if(inVectorSize == 3) {
- sprintf( destBuffer, patternV3,
- type == kDouble ? doubleExtensionPragma : "",
- typeName, typeName, typeName);
-
- } else {
- sprintf( destBuffer, pattern,
- type == kDouble ? doubleExtensionPragma : "",
- typeName, (int)inVectorSize, typeName, (int)inVectorSize );
+ std::string kernel_src =
+ concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
+ destBuffer = str_sprintf(kernel_src, typeName, typeName, typeName);
+ }
+ else
+ {
+ std::string kernel_src =
+ concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
+ destBuffer = str_sprintf(kernel_src, typeName, (int)inVectorSize,
+ typeName, (int)inVectorSize);
}
- // if(inVectorSize == 3 || inVectorSize == 4) {
- // log_info("\n----\n%s\n----\n", destBuffer);
- // }
}
int test_vstore_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
- return test_vstoreset( device, context, queue, create_global_store_code, 10240 );
+ return test_vset<test_vstore>(device, context, queue,
+ create_global_store_code, 10240);
}
-
-void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
+void create_local_store_code(std::string &destBuffer, size_t inBufferSize,
+ ExplicitType type, size_t inVectorSize,
+ size_t /*unused*/)
{
- const char *pattern =
- "%s"
- "\n"
- "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
+ // clang-format off
+ const char *pattern[] = {
+ pragma_str,
+ "#define LOC_TYPE %s\n"
+ "#define LOC_VTYPE %s%d\n"
+ "__kernel void test_fn(__local LOC_VTYPE *sSharedStorage, __global LOC_VTYPE *srcValues, __global uint *offsets, __global LOC_VTYPE *destBuffer, uint alignmentOffset )\n"
"{\n"
" int tid = get_global_id( 0 );\n"
// We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
- " sSharedStorage[ offsets[tid] ] = (%s%d)(%s)0;\n"
+ " sSharedStorage[ offsets[tid] ] = (LOC_VTYPE)(LOC_TYPE)0;\n"
" sSharedStorage[ offsets[tid] +1 ] = sSharedStorage[ offsets[tid] ];\n"
" barrier( CLK_LOCAL_MEM_FENCE );\n"
"\n"
- " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local %s *)sSharedStorage ) + alignmentOffset );\n"
+ " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local LOC_TYPE *)sSharedStorage ) + alignmentOffset );\n"
"\n"
// Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
// buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
@@ -830,20 +748,20 @@ void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitTyp
// Note: we only copy the relevant portion of our local storage over to the dest buffer, because
// otherwise, local threads would be overwriting results from other local threads
" int i;\n"
- " __local %s *sp = (__local %s*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n"
- " __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
+ " __local LOC_TYPE *sp = (__local LOC_TYPE*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n"
+ " __global LOC_TYPE *dp = (__global LOC_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
" for( i = 0; (size_t)i < sizeof( sSharedStorage[0]) / sizeof( *sp ); i++ ) \n"
" dp[i] = sp[i];\n"
- "}\n";
+ "}\n" };
- const char *patternV3 =
- "%s"
- "\n"
- "__kernel void test_fn(__local %s *sSharedStorage, __global %s *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
+ const char *patternV3 [] = {
+ pragma_str,
+ "#define LOC_TYPE %s\n"
+ "__kernel void test_fn(__local LOC_TYPE *sSharedStorage, __global LOC_TYPE *srcValues, __global uint *offsets, __global LOC_TYPE *destBuffer, uint alignmentOffset )\n"
"{\n"
" int tid = get_global_id( 0 );\n"
// We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
- " sSharedStorage[ 3*offsets[tid] ] = (%s)0;\n"
+ " sSharedStorage[ 3*offsets[tid] ] = (LOC_TYPE)0;\n"
" sSharedStorage[ 3*offsets[tid] +1 ] = \n"
" sSharedStorage[ 3*offsets[tid] ];\n"
" sSharedStorage[ 3*offsets[tid] +2 ] = \n"
@@ -865,30 +783,26 @@ void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitTyp
// Note: we only copy the relevant portion of our local storage over to the dest buffer, because
// otherwise, local threads would be overwriting results from other local threads
" int i;\n"
- " __local %s *sp = (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n"
- " __global %s *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n"
+ " __local LOC_TYPE *sp = (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n"
+ " __global LOC_TYPE *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n"
" for( i = 0; i < 3; i++ ) \n"
" dp[i] = sp[i];\n"
- "}\n";
+ "}\n" };
+ // clang-format on
const char *typeName = get_explicit_type_name(type);
if(inVectorSize == 3) {
- sprintf( destBuffer, patternV3,
- type == kDouble ? doubleExtensionPragma : "",
- typeName,
- typeName,
- typeName, typeName,
- typeName, typeName, typeName );
- } else {
- sprintf( destBuffer, pattern,
- type == kDouble ? doubleExtensionPragma : "",
- typeName, (int)inVectorSize,
- typeName, (int)inVectorSize, typeName, (int)inVectorSize,
- typeName, (int)inVectorSize, typeName,
- (int)inVectorSize, typeName, typeName,
- typeName, typeName, typeName );
+ std::string kernel_src =
+ concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
+ destBuffer = str_sprintf(kernel_src, typeName);
+ }
+ else
+ {
+ std::string kernel_src =
+ concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
+ destBuffer = str_sprintf(kernel_src, typeName, typeName,
+ (int)inVectorSize, (int)inVectorSize);
}
- // log_info(destBuffer);
}
int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
@@ -897,81 +811,82 @@ int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue
cl_ulong localSize;
int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
test_error( error, "Unable to get max size of local memory buffer" );
- if( localSize > 10240 )
- localSize = 10240;
+ if (localSize > 10240) localSize = 10240;
if (localSize > 4096)
localSize -= 2048;
else
localSize /= 2;
- return test_vstoreset( device, context, queue, create_local_store_code, (size_t)localSize );
+ return test_vset<test_vstore>(device, context, queue,
+ create_local_store_code, (size_t)localSize);
}
-
-void create_private_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
+void create_private_store_code(std::string &destBuffer, size_t inBufferSize,
+ ExplicitType type, size_t inVectorSize,
+ size_t /*unused*/)
{
- const char *pattern =
- "%s"
+ // clang-format off
+ const char *pattern [] = {
+ pragma_str,
+ "#define PRIV_TYPE %s\n"
+ "#define PRIV_VTYPE %s%d\n"
// Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
// for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
"\n"
- "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
+ "__kernel void test_fn( __global PRIV_VTYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n"
"{\n"
- " __private %s%d sPrivateStorage[ %d ];\n"
- " int tid = get_global_id( 0 );\n"
+ " __private PRIV_VTYPE sPrivateStorage[ %d ];\n"
+ " int tid = get_global_id( 0 );\n"
// We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
- " sPrivateStorage[tid] = (%s%d)(%s)0;\n"
+ " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n"
"\n"
- " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
+ " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n"
"\n"
// Note: we only copy the relevant portion of our local storage over to the dest buffer, because
// otherwise, local threads would be overwriting results from other local threads
" uint i;\n"
- " __private %s *sp = (__private %s*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n"
- " __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
+ " __private PRIV_TYPE *sp = (__private PRIV_TYPE*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n"
+ " __global PRIV_TYPE *dp = (__global PRIV_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
" for( i = 0; i < sizeof( sPrivateStorage[0]) / sizeof( *sp ); i++ ) \n"
" dp[i] = sp[i];\n"
- "}\n";
-
+ "}\n"};
- const char *patternV3 =
- "%s"
+ const char *patternV3 [] = {
+ pragma_str,
+ "#define PRIV_TYPE %s\n"
+ "#define PRIV_VTYPE %s3\n"
// Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
// for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
"\n"
- "__kernel void test_fn( __global %s *srcValues, __global uint *offsets, __global %s3 *destBuffer, uint alignmentOffset )\n"
+ "__kernel void test_fn( __global PRIV_TYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n"
"{\n"
- " __private %s3 sPrivateStorage[ %d ];\n" // keep this %d
- " int tid = get_global_id( 0 );\n"
+ " __private PRIV_VTYPE sPrivateStorage[ %d ];\n" // keep this %d
+ " int tid = get_global_id( 0 );\n"
// We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
- " sPrivateStorage[tid] = (%s3)(%s)0;\n"
+ " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n"
"\n"
-
- " vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
- "\n"
- // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
- // otherwise, local threads would be overwriting results from other local threads
+ " vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n"
" uint i;\n"
- " __private %s *sp = ((__private %s*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n"
- " __global %s *dp = ((__global %s*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n"
+ " __private PRIV_TYPE *sp = ((__private PRIV_TYPE*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n"
+ " __global PRIV_TYPE *dp = ((__global PRIV_TYPE*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n"
" for( i = 0; i < 3; i++ ) \n"
" dp[i] = sp[i];\n"
- "}\n";
+ "}\n"};
+ // clang-format on
const char *typeName = get_explicit_type_name(type);
if(inVectorSize == 3) {
- sprintf( destBuffer, patternV3,
- type == kDouble ? doubleExtensionPragma : "",
- typeName, typeName,
- typeName, (int)inBufferSize,
- typeName, typeName,
- typeName, typeName, typeName, typeName, typeName );
- } else {
- sprintf( destBuffer, pattern,
- type == kDouble ? doubleExtensionPragma : "",
- typeName, (int)inVectorSize, typeName, (int)inVectorSize,
- typeName, (int)inVectorSize, (int)inBufferSize,
- typeName, (int)inVectorSize, typeName,
- (int)inVectorSize, typeName, typeName, typeName, typeName, typeName );
+ std::string kernel_src =
+ concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
+ destBuffer =
+ str_sprintf(kernel_src, typeName, typeName, (int)inBufferSize);
+ }
+ else
+ {
+ std::string kernel_src =
+ concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
+ destBuffer =
+ str_sprintf(kernel_src, typeName, typeName, (int)inVectorSize,
+ (int)inBufferSize, (int)inVectorSize);
}
}
@@ -979,7 +894,8 @@ int test_vstore_private(cl_device_id device, cl_context context, cl_command_queu
{
// We have no idea how much actual private storage is available, so just pick a reasonable value,
// which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes
- return test_vstoreset( device, context, queue, create_private_store_code, 256 );
+ return test_vset<test_vstore>(device, context, queue,
+ create_private_store_code, 256);
}