From 3d0c227fa5004067d857c74f7963876b34ed4170 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 7 May 2023 20:02:52 +0200 Subject: AMAX/AMIN integer testing and bug fixes (#457) * Fixed a bug in XAMAX/XMIN routines that caused the increment and offset to be included in the result * Perform proper integer-output testing in XAMAX tests * A few changes towards getting it ready for a PR * Also fix compilation for clBLAS and cuBLAS references * Fix a bug that would only use the real part of complex numbers in the amax/amin routines * A few small fixes related to the AMAX tests --- CHANGELOG | 3 +++ scripts/generator/generator/cpp.py | 20 ++++++++++++-------- scripts/generator/generator/routine.py | 15 ++++++++++++--- src/kernels/level1/xamax.opencl | 4 ++-- src/utilities/utilities.hpp | 1 + test/correctness/testblas.cpp | 15 +++++++++++---- test/performance/client.cpp | 3 ++- test/routines/level1/xamax.hpp | 27 +++++++++++++++++---------- test/test_utilities.cpp | 2 ++ test/test_utilities.hpp | 2 ++ test/wrapper_cblas.hpp | 29 ++++++++++++++--------------- test/wrapper_clblas.hpp | 15 +++++++-------- test/wrapper_cublas.hpp | 12 ++++++------ 13 files changed, 91 insertions(+), 57 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 9409cae7..3b84ffc7 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -5,6 +5,9 @@ Development version (next version) * Toggle OpenCL kernel compilation options for Adreno * New preprocessor directive RELAX_WORKGROUP_SIZE - Fixed a bug in handling of #undef in CLBlast loop unrolling and array-to-register mapping functions +- Fixed a bug in XAMAX/XAMIN routines related to inadvertently including the increment and offset in the result +- Fixed a bug in XAMAX/XAMIN routines that would cause only the real part of a complex number to be taken into account +- Fixed a bug that caused tests to not properly do integer-output testing (for XAMAX/XAMIN) - Fixed a documentation bug in the 'ld' requirements - Added tuned parameters for various devices (see doc/tuning.md) diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 6dc3fc93..e32738ee 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -226,7 +226,10 @@ def wrapper_clblas(routine): # Convert to float (note: also integer buffers are stored as half/float) for buf in routine.inputs + routine.outputs: - result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL + if buf not in routine.index_buffers(): + result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL + else: + result += " auto " + buf + "_buffer_bis = " + buf + "_buffer;" + NL # Call the float routine result += " auto status = clblasX" + routine.name + "(" @@ -236,7 +239,8 @@ def wrapper_clblas(routine): # Convert back to half for buf in routine.outputs: - result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL + if buf not in routine.index_buffers(): + result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL result += " return status;" # Complete @@ -276,10 +280,6 @@ def wrapper_cblas(routine): extra_argument += "," + NL + indent extra_argument += "reinterpret_cast" extra_argument += "(&" + output_buffer + "_buffer[" + output_buffer + "_offset])" - elif output_buffer in routine.index_buffers(): - assignment = "reinterpret_cast(&" + output_buffer + "_buffer[0])[" + output_buffer + "_offset] = static_cast(" - postpostfix = ")" - indent += " " * (len(assignment) + 1) else: assignment = output_buffer + "_buffer[" + output_buffer + "_offset]" if flavour.name in ["Sc", "Dz"]: @@ -299,7 +299,10 @@ def wrapper_cblas(routine): # Convert to float (note: also integer buffers are stored as half/float) for buf in routine.inputs + routine.outputs: - result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer);" + NL + if buf not in routine.index_buffers(): + result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer);" + NL + else: + result += " auto " + buf + "_buffer_bis = " + buf + "_buffer;" + NL # Call the float routine result += " cblasX" + routine.name + "(" @@ -308,7 +311,8 @@ def wrapper_cblas(routine): # Convert back to half for buf in routine.outputs: - result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis);" + NL + if buf not in routine.index_buffers(): + result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis);" + NL # Complete result += "}" + NL diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 8b6ab57f..c2201c0d 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -282,7 +282,10 @@ class Routine: """As above but for OpenCL""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: - a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"] + if name == "imax": + a = [prefix + "Buffer& " + name + "_buffer"] + else: + a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"] b = ["const size_t " + name + "_offset"] c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + b + c)] @@ -292,7 +295,10 @@ class Routine: """As above but for CUDA""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: - a = [prefix + flavour.buffer_type + "* " + name + "_buffer"] + if name == "imax": + a = [prefix + "unsigned int * " + name + "_buffer"] + else: + a = [prefix + flavour.buffer_type + "* " + name + "_buffer"] b = ["const size_t " + name + "_offset"] c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + b + c)] @@ -302,7 +308,10 @@ class Routine: """As above but as vectors""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: - a = [prefix + "std::vector<" + flavour.buffer_type + ">& " + name + "_buffer"] + if name == "imax": + a = [prefix + "std::vector& " + name + "_buffer"] + else: + a = [prefix + "std::vector<" + flavour.buffer_type + ">& " + name + "_buffer"] b = ["const size_t " + name + "_offset"] c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + b + c)] diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 06a6773b..7cbbd6b5 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -55,7 +55,7 @@ void Xamax(const int n, while (id < n) { const int x_index = id*x_inc + x_offset; #if PRECISION == 3232 || PRECISION == 6464 - singlereal x = xgm[x_index].x; + singlereal x = fabs(xgm[x_index].x) + fabs(xgm[x_index].y); #else singlereal x = xgm[x_index]; #endif @@ -70,7 +70,7 @@ void Xamax(const int n, #endif if (x > max) { max = x; - imax = id*x_inc + x_offset; + imax = id; } id += WGS1*num_groups; } diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index b66df118..3ed51dd6 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -133,6 +133,7 @@ constexpr auto kBufMatB = "B"; constexpr auto kBufMatC = "C"; constexpr auto kBufMatAP = "AP"; constexpr auto kBufScalar = "Scalar"; +constexpr auto kBufScalarUint = "ScalarUint"; // ================================================================================================= diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp index d28aba40..0d0ce7c6 100644 --- a/test/correctness/testblas.cpp +++ b/test/correctness/testblas.cpp @@ -124,6 +124,7 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st auto c_mat2 = Buffer(context_, args.c_size); auto ap_mat2 = Buffer(context_, args.ap_size); auto scalar2 = Buffer(context_, args.scalar_size); + auto scalar_uint2 = Buffer(context_, args.scalar_size); x_vec2.Write(queue_, args.x_size, x_source_); y_vec2.Write(queue_, args.y_size, y_source_); a_mat2.Write(queue_, args.a_size, a_source_); @@ -131,7 +132,7 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st c_mat2.Write(queue_, args.c_size, c_source_); ap_mat2.Write(queue_, args.ap_size, ap_source_); scalar2.Write(queue_, args.scalar_size, scalar_source_); - auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; + auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2, scalar_uint2}; // Runs CLBlast if (verbose_) { @@ -158,6 +159,7 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st auto c_mat1 = Buffer(context_, args.c_size); auto ap_mat1 = Buffer(context_, args.ap_size); auto scalar1 = Buffer(context_, args.scalar_size); + auto scalar_uint1 = Buffer(context_, args.scalar_size); x_vec1.Write(queue_, args.x_size, x_source_); y_vec1.Write(queue_, args.y_size, y_source_); a_mat1.Write(queue_, args.a_size, a_source_); @@ -165,7 +167,7 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st c_mat1.Write(queue_, args.c_size, c_source_); ap_mat1.Write(queue_, args.ap_size, ap_source_); scalar1.Write(queue_, args.scalar_size, scalar_source_); - auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; + auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1, scalar_uint1}; // Runs the reference code if (verbose_) { @@ -221,6 +223,9 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st // Checks for differences in the 'canary' region to detect buffer overflows for (auto canary_id=size_t{0}; canary_id= result1.size() || index >= result2.size()) { + continue; + } if (!TestSimilarity(result1[index], result2[index])) { errors++; if (verbose_) { @@ -273,6 +278,7 @@ void TestBlas::TestInvalid(std::vector> &test_vector, const st auto c_mat1 = CreateInvalidBuffer(context_, args.c_size); auto ap_mat1 = CreateInvalidBuffer(context_, args.ap_size); auto scalar1 = CreateInvalidBuffer(context_, args.scalar_size); + auto scalar_uint1 = CreateInvalidBuffer(context_, args.scalar_size); auto x_vec2 = CreateInvalidBuffer(context_, args.x_size); auto y_vec2 = CreateInvalidBuffer(context_, args.y_size); auto a_mat2 = CreateInvalidBuffer(context_, args.a_size); @@ -280,8 +286,9 @@ void TestBlas::TestInvalid(std::vector> &test_vector, const st auto c_mat2 = CreateInvalidBuffer(context_, args.c_size); auto ap_mat2 = CreateInvalidBuffer(context_, args.ap_size); auto scalar2 = CreateInvalidBuffer(context_, args.scalar_size); - auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; - auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; + auto scalar_uint2 = CreateInvalidBuffer(context_, args.scalar_size); + auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1, scalar_uint1}; + auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2, scalar_uint2}; // Runs CLBlast if (verbose_) { diff --git a/test/performance/client.cpp b/test/performance/client.cpp index 34891429..e6930203 100644 --- a/test/performance/client.cpp +++ b/test/performance/client.cpp @@ -246,6 +246,7 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) auto c_mat = Buffer(context, args.c_size); auto ap_mat = Buffer(context, args.ap_size); auto scalar = Buffer(context, args.scalar_size); + auto scalar_uint = Buffer(context, args.scalar_size); x_vec.Write(queue, args.x_size, x_source); y_vec.Write(queue, args.y_size, y_source); a_mat.Write(queue, args.a_size, a_source); @@ -253,7 +254,7 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) c_mat.Write(queue, args.c_size, c_source); ap_mat.Write(queue, args.ap_size, ap_source); scalar.Write(queue, args.scalar_size, scalar_source); - auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar}; + auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar, scalar_uint}; // Runs the routines and collects the timings auto timings = std::vector>(); diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp index 71c1a0ec..aa0f8e10 100644 --- a/test/routines/level1/xamax.hpp +++ b/test/routines/level1/xamax.hpp @@ -35,15 +35,15 @@ class TestXamax { kArgXInc, kArgXOffset, kArgImaxOffset}; } - static std::vector BuffersIn() { return {kBufVecX, kBufScalar}; } - static std::vector BuffersOut() { return {kBufScalar}; } + static std::vector BuffersIn() { return {kBufVecX, kBufScalarUint}; } + static std::vector BuffersOut() { return {kBufScalarUint}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n * args.x_inc + args.x_offset; } static size_t GetSizeImax(const Arguments &args) { - return (1 + args.imax_offset) * 2; // always a 4-byte integer, this is a hack for FP16 + return args.imax_offset + 1; } // Describes how to set the sizes of all the buffers @@ -73,13 +73,13 @@ class TestXamax { auto queue_plain = queue(); auto event = cl_event{}; auto status = Amax(args.n, - buffers.scalar(), args.imax_offset, + buffers.scalar_uint(), args.imax_offset, buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Amax(args.n, - buffers.scalar(), args.imax_offset, + buffers.scalar_uint(), args.imax_offset, buffers.x_vec(), args.x_offset, args.x_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); @@ -93,7 +93,7 @@ class TestXamax { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXamax(args.n, - buffers.scalar, args.imax_offset, + buffers.scalar_uint, args.imax_offset, buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); @@ -105,7 +105,7 @@ class TestXamax { #ifdef CLBLAST_REF_CBLAS static StatusCode RunReference2(const Arguments &args, BuffersHost &buffers_host, Queue &) { cblasXamax(args.n, - buffers_host.scalar, args.imax_offset, + buffers_host.scalar_uint, args.imax_offset, buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } @@ -115,7 +115,7 @@ class TestXamax { #ifdef CLBLAST_REF_CUBLAS static StatusCode RunReference3(const Arguments &args, BuffersCUDA &buffers, Queue &) { auto status = cublasXamax(reinterpret_cast(args.cublas_handle), args.n, - buffers.scalar, args.imax_offset, + buffers.scalar_uint, args.imax_offset, buffers.x_vec, args.x_offset, args.x_inc); if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } } @@ -123,8 +123,15 @@ class TestXamax { // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector result(args.scalar_size, static_cast(0)); - buffers.scalar.Read(queue, args.scalar_size, result); + std::vector result_uint(args.scalar_size, 0); + buffers.scalar_uint.Read(queue, args.scalar_size, result_uint); + // The result is an integer. However, since the test infrastructure assumes results of + // type 'T' (float/double/float2/double2/half), we store the results into T instead. + // The values might then become meaningless, but a comparison for testing should still + // be valid to verify correctness. + auto result_as_T = static_cast(result_uint[0]); + std::vector result(args.scalar_size); + result[0] = result_as_T; return result; } diff --git a/test/test_utilities.cpp b/test/test_utilities.cpp index 8029d259..f3116238 100644 --- a/test/test_utilities.cpp +++ b/test/test_utilities.cpp @@ -52,6 +52,7 @@ void DeviceToHost(const Arguments &args, Buffers &buffers, BuffersHost else if (name == kBufMatC) { buffers_host.c_mat = std::vector(args.c_size, static_cast(0)); buffers.c_mat.Read(queue, args.c_size, buffers_host.c_mat); } else if (name == kBufMatAP) { buffers_host.ap_mat = std::vector(args.ap_size, static_cast(0)); buffers.ap_mat.Read(queue, args.ap_size, buffers_host.ap_mat); } else if (name == kBufScalar) { buffers_host.scalar = std::vector(args.scalar_size, static_cast(0)); buffers.scalar.Read(queue, args.scalar_size, buffers_host.scalar); } + else if (name == kBufScalarUint) { buffers_host.scalar_uint = std::vector(args.scalar_size, 0); buffers.scalar_uint.Read(queue, args.scalar_size, buffers_host.scalar_uint); } else { throw std::runtime_error("Invalid buffer name"); } } } @@ -67,6 +68,7 @@ void HostToDevice(const Arguments &args, Buffers &buffers, BuffersHost else if (name == kBufMatC) { buffers.c_mat.Write(queue, args.c_size, buffers_host.c_mat); } else if (name == kBufMatAP) { buffers.ap_mat.Write(queue, args.ap_size, buffers_host.ap_mat); } else if (name == kBufScalar) { buffers.scalar.Write(queue, args.scalar_size, buffers_host.scalar); } + else if (name == kBufScalarUint) { buffers.scalar_uint.Write(queue, args.scalar_size, buffers_host.scalar_uint); } else { throw std::runtime_error("Invalid buffer name"); } } } diff --git a/test/test_utilities.hpp b/test/test_utilities.hpp index 7bf5e65f..4ea5cdfb 100644 --- a/test/test_utilities.hpp +++ b/test/test_utilities.hpp @@ -56,6 +56,7 @@ struct Buffers { Buffer c_mat; Buffer ap_mat; Buffer scalar; + Buffer scalar_uint; }; template struct BuffersHost { @@ -66,6 +67,7 @@ struct BuffersHost { std::vector c_mat; std::vector ap_mat; std::vector scalar; + std::vector scalar_uint; }; // ================================================================================================= diff --git a/test/wrapper_cblas.hpp b/test/wrapper_cblas.hpp index a47ff725..fe5c7a43 100644 --- a/test/wrapper_cblas.hpp +++ b/test/wrapper_cblas.hpp @@ -455,38 +455,37 @@ void cblasXasum(const size_t n, // Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX void cblasXamax(const size_t n, - std::vector& imax_buffer, const size_t imax_offset, + std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - reinterpret_cast(&imax_buffer[0])[imax_offset] = static_cast(cblas_isamax(static_cast(n), - &x_buffer[x_offset], static_cast(x_inc))); + imax_buffer[imax_offset] = cblas_isamax(static_cast(n), + &x_buffer[x_offset], static_cast(x_inc)); } void cblasXamax(const size_t n, - std::vector& imax_buffer, const size_t imax_offset, + std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - reinterpret_cast(&imax_buffer[0])[imax_offset] = static_cast(cblas_idamax(static_cast(n), - &x_buffer[x_offset], static_cast(x_inc))); + imax_buffer[imax_offset] = cblas_idamax(static_cast(n), + &x_buffer[x_offset], static_cast(x_inc)); } void cblasXamax(const size_t n, - std::vector& imax_buffer, const size_t imax_offset, + std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - reinterpret_cast(&imax_buffer[0])[imax_offset] = static_cast(cblas_icamax(static_cast(n), - reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); + imax_buffer[imax_offset] = cblas_icamax(static_cast(n), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } void cblasXamax(const size_t n, - std::vector& imax_buffer, const size_t imax_offset, + std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - reinterpret_cast(&imax_buffer[0])[imax_offset] = static_cast(cblas_izamax(static_cast(n), - reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); + imax_buffer[imax_offset] = cblas_izamax(static_cast(n), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } void cblasXamax(const size_t n, - std::vector& imax_buffer, const size_t imax_offset, + std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer); - auto imax_buffer_bis = HalfToFloatBuffer(imax_buffer); + auto imax_buffer_bis = imax_buffer; cblasXamax(n, imax_buffer_bis, imax_offset, x_buffer_bis, x_offset, x_inc); - FloatToHalfBuffer(imax_buffer, imax_buffer_bis); } // ================================================================================================= diff --git a/test/wrapper_clblas.hpp b/test/wrapper_clblas.hpp index f1b3a0c4..8fdff2bc 100644 --- a/test/wrapper_clblas.hpp +++ b/test/wrapper_clblas.hpp @@ -744,13 +744,13 @@ clblasStatus clblasXasum(const size_t n, // Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template clblasStatus clblasXamax(const size_t n, - Buffer& imax_buffer, const size_t imax_offset, + Buffer& imax_buffer, const size_t imax_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXamax(const size_t n, - Buffer& imax_buffer, const size_t imax_offset, + Buffer& imax_buffer, const size_t imax_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { @@ -765,7 +765,7 @@ clblasStatus clblasXamax(const size_t n, } template <> clblasStatus clblasXamax(const size_t n, - Buffer& imax_buffer, const size_t imax_offset, + Buffer& imax_buffer, const size_t imax_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { @@ -780,7 +780,7 @@ clblasStatus clblasXamax(const size_t n, } template <> clblasStatus clblasXamax(const size_t n, - Buffer& imax_buffer, const size_t imax_offset, + Buffer& imax_buffer, const size_t imax_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { @@ -795,7 +795,7 @@ clblasStatus clblasXamax(const size_t n, } template <> clblasStatus clblasXamax(const size_t n, - Buffer& imax_buffer, const size_t imax_offset, + Buffer& imax_buffer, const size_t imax_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { @@ -810,17 +810,16 @@ clblasStatus clblasXamax(const size_t n, } template <> clblasStatus clblasXamax(const size_t n, - Buffer& imax_buffer, const size_t imax_offset, + Buffer& imax_buffer, const size_t imax_offset, const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); - auto imax_buffer_bis = HalfToFloatBuffer(imax_buffer, queues[0]); + auto imax_buffer_bis = imax_buffer; auto status = clblasXamax(n, imax_buffer_bis, imax_offset, x_buffer_bis, x_offset, x_inc, num_queues, queues, num_wait_events, wait_events, events); - FloatToHalfBuffer(imax_buffer, imax_buffer_bis, queues[0]); return status; } diff --git a/test/wrapper_cublas.hpp b/test/wrapper_cublas.hpp index 35b1b9c6..0a10a5ed 100644 --- a/test/wrapper_cublas.hpp +++ b/test/wrapper_cublas.hpp @@ -576,11 +576,11 @@ cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, // Forwards the cuBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, - T* imax_buffer, const size_t imax_offset, + unsigned int * imax_buffer, const size_t imax_offset, const T* x_buffer, const size_t x_offset, const size_t x_inc); template <> cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, - float* imax_buffer, const size_t imax_offset, + unsigned int * imax_buffer, const size_t imax_offset, const float* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasIsamax(handle, static_cast(n), &x_buffer[x_offset], static_cast(x_inc), @@ -590,7 +590,7 @@ cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, } template <> cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, - double* imax_buffer, const size_t imax_offset, + unsigned int * imax_buffer, const size_t imax_offset, const double* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasIdamax(handle, static_cast(n), &x_buffer[x_offset], static_cast(x_inc), @@ -600,7 +600,7 @@ cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, } template <> cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, - float2* imax_buffer, const size_t imax_offset, + unsigned int * imax_buffer, const size_t imax_offset, const float2* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasIcamax(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), @@ -610,7 +610,7 @@ cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, } template <> cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, - double2* imax_buffer, const size_t imax_offset, + unsigned int * imax_buffer, const size_t imax_offset, const double2* x_buffer, const size_t x_offset, const size_t x_inc) { auto status = cublasIzamax(handle, static_cast(n), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), @@ -620,7 +620,7 @@ cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, } template <> cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, - half* imax_buffer, const size_t imax_offset, + unsigned int * imax_buffer, const size_t imax_offset, const half* x_buffer, const size_t x_offset, const size_t x_inc) { return CUBLAS_STATUS_NOT_SUPPORTED; } -- cgit v1.2.3