about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Steven Winston <gpx1000@users.noreply.github.com> 2023-10-03 09:26:48 -0700
committer: GitHub <noreply@github.com> 2023-10-03 09:26:48 -0700
commit: 43d6886810919475d95302fb6173de573ffcdef6 (patch)
tree: 79191ab2ff53b9cd9aab031cc7eba037db1aabf2
parent: f39685030f5f72883c0a71400a2423e436870639 (diff)
download: OpenCL-CTS-43d6886810919475d95302fb6173de573ffcdef6.tar.gz
rewrite test_select to run in a few seconds. (#1665)
* Rewrite test_select to run in a few seconds.
* Remove the threading; revert to the original method.
* Merge from master, remove all suggested changes, and start with a simple change; report on each change's cost savings:
  BEFORE: real 47m8.497s user 48m8.860s sys 0m14.952s
  AFTER: real 17m53.383s user 18m53.342s sys 0m13.297s
  initSrcBuffer generates the same random noise on every iteration through the loop. There is no change to the arguments, and the host data itself doesn't need to be rewritten. Profiling shows a 2 times speedup from simply relying upon the buffer to remain randomized at the next loop iteration.
* BEFORE: real 17m53.383s user 18m53.342s sys 0m13.297s
  AFTER: real 12m26.035s user 13m15.505s sys 0m15.414s
  Rearrange a few things in the loops to allow for vectorized / interleaved loop traversal. NB: not all loops are vectorizable, obviously, but this addresses the worst offenders. Also note: to enable the compiler to generate vectorized and interleaved loop traversal, build with -O3.
* Address the CI format requirements.
* Address the CI format requirements.
* Address the CI format requirements.
-rw-r--r-- test_conformance/select/test_select.cpp 56
1 files changed, 38 insertions, 18 deletions
diff --git a/test_conformance/select/test_select.cpp b/test_conformance/select/test_select.cpp
index e9009e49..72be08c7 100644
--- a/test_conformance/select/test_select.cpp
+++ b/test_conformance/select/test_select.cpp
@@ -47,11 +47,14 @@ static void initSrcBuffer(void* src1, Type stype, MTdata);
// initialize the valued used to compare with in the select with
// vlaues [start, count)
-static void initCmpBuffer(void* cmp, Type cmptype, uint64_t start, size_t count);
+static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start,
+ const size_t count);
// make a program that uses select for the given stype (src/dest type),
// ctype (comparison type), veclen (vector length)
-static cl_program makeSelectProgram(cl_kernel *kernel_ptr, const cl_context context, Type stype, Type ctype, size_t veclen );
+static cl_program makeSelectProgram(cl_kernel *kernel_ptr, cl_context context,
+ Type stype, Type ctype,
+ const size_t veclen);
// Creates and execute the select test for the given device, context,
// stype (source/dest type), cmptype (comparison type), using max_tg_size
@@ -121,7 +124,9 @@ static void initSrcBuffer(void* src1, Type stype, MTdata d)
s1[i] = genrand_int32(d);
}
-static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start, size_t count)
+static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start,
+ const size_t count)
+
{
assert(cmptype != kfloat);
switch (type_size[cmptype]) {
@@ -144,11 +149,12 @@ static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start, size_t count)
// The short test doesn't iterate over the entire 32 bit space so
// we alternate between positive and negative values
int32_t* ui = (int32_t *)cmp;
- int32_t sign = 1;
- for (size_t i = 0; i < count; ++i, ++start)
+ int32_t neg_start = (int32_t)start * -1;
+ for (size_t i = 0; i < count; i++)
{
- ui[i] = (int32_t)start*sign;
- sign = sign * -1;
+ ++start;
+ --neg_start;
+ ui[i] = (int32_t)((i % 2) ? start : neg_start);
}
}
break;
@@ -157,11 +163,12 @@ static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start, size_t count)
// We don't iterate over the entire space of 64 bit so for the
// selects, we want to test positive and negative values
int64_t* ll = (int64_t *)cmp;
- int64_t sign = 1;
- for (size_t i = 0; i < count; ++i, ++start)
+ int64_t neg_start = (int64_t)start * -1;
+ for (size_t i = 0; i < count; i++)
{
- ll[i] = start*sign;
- sign = sign * -1;
+ ++start;
+ --neg_start;
+ ll[i] = (int64_t)((i % 2) ? start : neg_start);
}
break;
}
@@ -173,7 +180,9 @@ static void initCmpBuffer(void *cmp, Type cmptype, uint64_t start, size_t count)
// Make the various incarnations of the program we want to run
// stype: source and destination type for the select
// ctype: compare type
-static cl_program makeSelectProgram(cl_kernel *kernel_ptr, const cl_context context, Type srctype, Type cmptype, size_t vec_len)
+static cl_program makeSelectProgram(cl_kernel *kernel_ptr,
+ const cl_context context, Type srctype,
+ Type cmptype, const size_t vec_len)
{
char testname[256];
char stypename[32];
@@ -309,7 +318,7 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
clMemWrapper src1, src2, cmp, dest;
cl_ulong blocks = type_size[stype] * 0x100000000ULL / BUFFER_SIZE;
- size_t block_elements = BUFFER_SIZE / type_size[stype];
+ const size_t block_elements = BUFFER_SIZE / type_size[stype];
size_t step = s_wimpy_mode ? s_wimpy_reduction_factor : 1;
cl_ulong cmp_stride = block_elements * step;
@@ -355,10 +364,21 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
dest = clCreateBuffer( context, CL_MEM_WRITE_ONLY, BUFFER_SIZE, NULL, &err );
test_error_count(err, "Error: could not allocate dest buffer\n");
- for (int vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
+ programs[0] = makeSelectProgram(&kernels[0], context, stype, cmptype,
+ element_count[0]);
+ programs[1] = makeSelectProgram(&kernels[1], context, stype, cmptype,
+ element_count[1]);
+ programs[2] = makeSelectProgram(&kernels[2], context, stype, cmptype,
+ element_count[2]);
+ programs[3] = makeSelectProgram(&kernels[3], context, stype, cmptype,
+ element_count[3]);
+ programs[4] = makeSelectProgram(&kernels[4], context, stype, cmptype,
+ element_count[4]);
+ programs[5] = makeSelectProgram(&kernels[5], context, stype, cmptype,
+ element_count[5]);
+
+ for (size_t vecsize = 0; vecsize < VECTOR_SIZE_COUNT; ++vecsize)
{
- programs[vecsize] = makeSelectProgram(&kernels[vecsize], context, stype,
- cmptype, element_count[vecsize]);
if (!programs[vecsize] || !kernels[vecsize])
{
return -1;
@@ -391,10 +411,10 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
log_info("Testing...");
uint64_t i;
+ initSrcBuffer(src1_host.data(), stype, d);
+ initSrcBuffer(src2_host.data(), stype, d);
for (i=0; i < blocks; i+=step)
{
- initSrcBuffer(src1_host.data(), stype, d);
- initSrcBuffer(src2_host.data(), stype, d);
initCmpBuffer(cmp_host.data(), cmptype, i * cmp_stride, block_elements);
err = clEnqueueWriteBuffer(queue, src1, CL_FALSE, 0, BUFFER_SIZE,