summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2023-05-07 20:02:52 +0200
committerGitHub <noreply@github.com>2023-05-07 20:02:52 +0200
commit3d0c227fa5004067d857c74f7963876b34ed4170 (patch)
tree3ff7c3766f1c2f13717ecc4a819da0815fa7cdb3
parent1573f7d3040ddb6005e71bf4f770566f627236d2 (diff)
AMAX/AMIN integer testing and bug fixes (#457)
* Fixed a bug in XAMAX/XMIN routines that caused the increment and offset to be included in the result * Perform proper integer-output testing in XAMAX tests * A few changes towards getting it ready for a PR * Also fix compilation for clBLAS and cuBLAS references * Fix a bug that would only use the real part of complex numbers in the amax/amin routines * A few small fixes related to the AMAX tests
-rw-r--r--CHANGELOG3
-rw-r--r--scripts/generator/generator/cpp.py20
-rw-r--r--scripts/generator/generator/routine.py15
-rw-r--r--src/kernels/level1/xamax.opencl4
-rw-r--r--src/utilities/utilities.hpp1
-rw-r--r--test/correctness/testblas.cpp15
-rw-r--r--test/performance/client.cpp3
-rw-r--r--test/routines/level1/xamax.hpp27
-rw-r--r--test/test_utilities.cpp2
-rw-r--r--test/test_utilities.hpp2
-rw-r--r--test/wrapper_cblas.hpp29
-rw-r--r--test/wrapper_clblas.hpp15
-rw-r--r--test/wrapper_cublas.hpp12
13 files changed, 91 insertions, 57 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 9409cae7..3b84ffc7 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -5,6 +5,9 @@ Development version (next version)
* Toggle OpenCL kernel compilation options for Adreno
* New preprocessor directive RELAX_WORKGROUP_SIZE
- Fixed a bug in handling of #undef in CLBlast loop unrolling and array-to-register mapping functions
+- Fixed a bug in XAMAX/XAMIN routines related to inadvertently including the increment and offset in the result
+- Fixed a bug in XAMAX/XAMIN routines that would cause only the real part of a complex number to be taken into account
+- Fixed a bug that caused tests to not properly do integer-output testing (for XAMAX/XAMIN)
- Fixed a documentation bug in the 'ld' requirements
- Added tuned parameters for various devices (see doc/tuning.md)
diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py
index 6dc3fc93..e32738ee 100644
--- a/scripts/generator/generator/cpp.py
+++ b/scripts/generator/generator/cpp.py
@@ -226,7 +226,10 @@ def wrapper_clblas(routine):
# Convert to float (note: also integer buffers are stored as half/float)
for buf in routine.inputs + routine.outputs:
- result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL
+ if buf not in routine.index_buffers():
+ result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL
+ else:
+ result += " auto " + buf + "_buffer_bis = " + buf + "_buffer;" + NL
# Call the float routine
result += " auto status = clblasX" + routine.name + "("
@@ -236,7 +239,8 @@ def wrapper_clblas(routine):
# Convert back to half
for buf in routine.outputs:
- result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL
+ if buf not in routine.index_buffers():
+ result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL
result += " return status;"
# Complete
@@ -276,10 +280,6 @@ def wrapper_cblas(routine):
extra_argument += "," + NL + indent
extra_argument += "reinterpret_cast<return_pointer_" + flavour.buffer_type[:-1] + ">"
extra_argument += "(&" + output_buffer + "_buffer[" + output_buffer + "_offset])"
- elif output_buffer in routine.index_buffers():
- assignment = "reinterpret_cast<int*>(&" + output_buffer + "_buffer[0])[" + output_buffer + "_offset] = static_cast<int>("
- postpostfix = ")"
- indent += " " * (len(assignment) + 1)
else:
assignment = output_buffer + "_buffer[" + output_buffer + "_offset]"
if flavour.name in ["Sc", "Dz"]:
@@ -299,7 +299,10 @@ def wrapper_cblas(routine):
# Convert to float (note: also integer buffers are stored as half/float)
for buf in routine.inputs + routine.outputs:
- result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer);" + NL
+ if buf not in routine.index_buffers():
+ result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer);" + NL
+ else:
+ result += " auto " + buf + "_buffer_bis = " + buf + "_buffer;" + NL
# Call the float routine
result += " cblasX" + routine.name + "("
@@ -308,7 +311,8 @@ def wrapper_cblas(routine):
# Convert back to half
for buf in routine.outputs:
- result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis);" + NL
+ if buf not in routine.index_buffers():
+ result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis);" + NL
# Complete
result += "}" + NL
diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py
index 8b6ab57f..c2201c0d 100644
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@@ -282,7 +282,10 @@ class Routine:
"""As above but for OpenCL"""
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
- a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"]
+ if name == "imax":
+ a = [prefix + "Buffer<unsigned int>& " + name + "_buffer"]
+ else:
+ a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"]
b = ["const size_t " + name + "_offset"]
c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
return [", ".join(a + b + c)]
@@ -292,7 +295,10 @@ class Routine:
"""As above but for CUDA"""
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
- a = [prefix + flavour.buffer_type + "* " + name + "_buffer"]
+ if name == "imax":
+ a = [prefix + "unsigned int * " + name + "_buffer"]
+ else:
+ a = [prefix + flavour.buffer_type + "* " + name + "_buffer"]
b = ["const size_t " + name + "_offset"]
c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
return [", ".join(a + b + c)]
@@ -302,7 +308,10 @@ class Routine:
"""As above but as vectors"""
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
- a = [prefix + "std::vector<" + flavour.buffer_type + ">& " + name + "_buffer"]
+ if name == "imax":
+ a = [prefix + "std::vector<unsigned int>& " + name + "_buffer"]
+ else:
+ a = [prefix + "std::vector<" + flavour.buffer_type + ">& " + name + "_buffer"]
b = ["const size_t " + name + "_offset"]
c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
return [", ".join(a + b + c)]
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 06a6773b..7cbbd6b5 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -55,7 +55,7 @@ void Xamax(const int n,
while (id < n) {
const int x_index = id*x_inc + x_offset;
#if PRECISION == 3232 || PRECISION == 6464
- singlereal x = xgm[x_index].x;
+ singlereal x = fabs(xgm[x_index].x) + fabs(xgm[x_index].y);
#else
singlereal x = xgm[x_index];
#endif
@@ -70,7 +70,7 @@ void Xamax(const int n,
#endif
if (x > max) {
max = x;
- imax = id*x_inc + x_offset;
+ imax = id;
}
id += WGS1*num_groups;
}
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index b66df118..3ed51dd6 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -133,6 +133,7 @@ constexpr auto kBufMatB = "B";
constexpr auto kBufMatC = "C";
constexpr auto kBufMatAP = "AP";
constexpr auto kBufScalar = "Scalar";
+constexpr auto kBufScalarUint = "ScalarUint";
// =================================================================================================
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index d28aba40..0d0ce7c6 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -124,6 +124,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
auto c_mat2 = Buffer<T>(context_, args.c_size);
auto ap_mat2 = Buffer<T>(context_, args.ap_size);
auto scalar2 = Buffer<T>(context_, args.scalar_size);
+ auto scalar_uint2 = Buffer<unsigned int>(context_, args.scalar_size);
x_vec2.Write(queue_, args.x_size, x_source_);
y_vec2.Write(queue_, args.y_size, y_source_);
a_mat2.Write(queue_, args.a_size, a_source_);
@@ -131,7 +132,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
c_mat2.Write(queue_, args.c_size, c_source_);
ap_mat2.Write(queue_, args.ap_size, ap_source_);
scalar2.Write(queue_, args.scalar_size, scalar_source_);
- auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
+ auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2, scalar_uint2};
// Runs CLBlast
if (verbose_) {
@@ -158,6 +159,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
auto c_mat1 = Buffer<T>(context_, args.c_size);
auto ap_mat1 = Buffer<T>(context_, args.ap_size);
auto scalar1 = Buffer<T>(context_, args.scalar_size);
+ auto scalar_uint1 = Buffer<unsigned int>(context_, args.scalar_size);
x_vec1.Write(queue_, args.x_size, x_source_);
y_vec1.Write(queue_, args.y_size, y_source_);
a_mat1.Write(queue_, args.a_size, a_source_);
@@ -165,7 +167,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
c_mat1.Write(queue_, args.c_size, c_source_);
ap_mat1.Write(queue_, args.ap_size, ap_source_);
scalar1.Write(queue_, args.scalar_size, scalar_source_);
- auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
+ auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1, scalar_uint1};
// Runs the reference code
if (verbose_) {
@@ -221,6 +223,9 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
// Checks for differences in the 'canary' region to detect buffer overflows
for (auto canary_id=size_t{0}; canary_id<kCanarySize; ++canary_id) {
auto index = get_index_(args, get_id1_(args) - 1, get_id2_(args) - 1) + canary_id;
+ if (index >= result1.size() || index >= result2.size()) {
+ continue;
+ }
if (!TestSimilarity(result1[index], result2[index])) {
errors++;
if (verbose_) {
@@ -273,6 +278,7 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st
auto c_mat1 = CreateInvalidBuffer<T>(context_, args.c_size);
auto ap_mat1 = CreateInvalidBuffer<T>(context_, args.ap_size);
auto scalar1 = CreateInvalidBuffer<T>(context_, args.scalar_size);
+ auto scalar_uint1 = CreateInvalidBuffer<unsigned int>(context_, args.scalar_size);
auto x_vec2 = CreateInvalidBuffer<T>(context_, args.x_size);
auto y_vec2 = CreateInvalidBuffer<T>(context_, args.y_size);
auto a_mat2 = CreateInvalidBuffer<T>(context_, args.a_size);
@@ -280,8 +286,9 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st
auto c_mat2 = CreateInvalidBuffer<T>(context_, args.c_size);
auto ap_mat2 = CreateInvalidBuffer<T>(context_, args.ap_size);
auto scalar2 = CreateInvalidBuffer<T>(context_, args.scalar_size);
- auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
- auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
+ auto scalar_uint2 = CreateInvalidBuffer<unsigned int>(context_, args.scalar_size);
+ auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1, scalar_uint1};
+ auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2, scalar_uint2};
// Runs CLBlast
if (verbose_) {
diff --git a/test/performance/client.cpp b/test/performance/client.cpp
index 34891429..e6930203 100644
--- a/test/performance/client.cpp
+++ b/test/performance/client.cpp
@@ -246,6 +246,7 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
auto c_mat = Buffer<T>(context, args.c_size);
auto ap_mat = Buffer<T>(context, args.ap_size);
auto scalar = Buffer<T>(context, args.scalar_size);
+ auto scalar_uint = Buffer<unsigned int>(context, args.scalar_size);
x_vec.Write(queue, args.x_size, x_source);
y_vec.Write(queue, args.y_size, y_source);
a_mat.Write(queue, args.a_size, a_source);
@@ -253,7 +254,7 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
c_mat.Write(queue, args.c_size, c_source);
ap_mat.Write(queue, args.ap_size, ap_source);
scalar.Write(queue, args.scalar_size, scalar_source);
- auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar};
+ auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar, scalar_uint};
// Runs the routines and collects the timings
auto timings = std::vector<std::pair<std::string, TimeResult>>();
diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp
index 71c1a0ec..aa0f8e10 100644
--- a/test/routines/level1/xamax.hpp
+++ b/test/routines/level1/xamax.hpp
@@ -35,15 +35,15 @@ class TestXamax {
kArgXInc,
kArgXOffset, kArgImaxOffset};
}
- static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufScalar}; }
- static std::vector<std::string> BuffersOut() { return {kBufScalar}; }
+ static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufScalarUint}; }
+ static std::vector<std::string> BuffersOut() { return {kBufScalarUint}; }
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) {
return args.n * args.x_inc + args.x_offset;
}
static size_t GetSizeImax(const Arguments<T> &args) {
- return (1 + args.imax_offset) * 2; // always a 4-byte integer, this is a hack for FP16
+ return args.imax_offset + 1;
}
// Describes how to set the sizes of all the buffers
@@ -73,13 +73,13 @@ class TestXamax {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Amax<T>(args.n,
- buffers.scalar(), args.imax_offset,
+ buffers.scalar_uint(), args.imax_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
&queue_plain, &event);
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
#elif CUDA_API
auto status = Amax<T>(args.n,
- buffers.scalar(), args.imax_offset,
+ buffers.scalar_uint(), args.imax_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
queue.GetContext()(), queue.GetDevice()());
cuStreamSynchronize(queue());
@@ -93,7 +93,7 @@ class TestXamax {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXamax<T>(args.n,
- buffers.scalar, args.imax_offset,
+ buffers.scalar_uint, args.imax_offset,
buffers.x_vec, args.x_offset, args.x_inc,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
@@ -105,7 +105,7 @@ class TestXamax {
#ifdef CLBLAST_REF_CBLAS
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
cblasXamax(args.n,
- buffers_host.scalar, args.imax_offset,
+ buffers_host.scalar_uint, args.imax_offset,
buffers_host.x_vec, args.x_offset, args.x_inc);
return StatusCode::kSuccess;
}
@@ -115,7 +115,7 @@ class TestXamax {
#ifdef CLBLAST_REF_CUBLAS
static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
auto status = cublasXamax(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n,
- buffers.scalar, args.imax_offset,
+ buffers.scalar_uint, args.imax_offset,
buffers.x_vec, args.x_offset, args.x_inc);
if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; }
}
@@ -123,8 +123,15 @@ class TestXamax {
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- std::vector<T> result(args.scalar_size, static_cast<T>(0));
- buffers.scalar.Read(queue, args.scalar_size, result);
+ std::vector<unsigned int> result_uint(args.scalar_size, 0);
+ buffers.scalar_uint.Read(queue, args.scalar_size, result_uint);
+ // The result is an integer. However, since the test infrastructure assumes results of
+ // type 'T' (float/double/float2/double2/half), we store the results into T instead.
+ // The values might then become meaningless, but a comparison for testing should still
+ // be valid to verify correctness.
+ auto result_as_T = static_cast<T>(result_uint[0]);
+ std::vector<T> result(args.scalar_size);
+ result[0] = result_as_T;
return result;
}
diff --git a/test/test_utilities.cpp b/test/test_utilities.cpp
index 8029d259..f3116238 100644
--- a/test/test_utilities.cpp
+++ b/test/test_utilities.cpp
@@ -52,6 +52,7 @@ void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T>
else if (name == kBufMatC) { buffers_host.c_mat = std::vector<T>(args.c_size, static_cast<T>(0)); buffers.c_mat.Read(queue, args.c_size, buffers_host.c_mat); }
else if (name == kBufMatAP) { buffers_host.ap_mat = std::vector<T>(args.ap_size, static_cast<T>(0)); buffers.ap_mat.Read(queue, args.ap_size, buffers_host.ap_mat); }
else if (name == kBufScalar) { buffers_host.scalar = std::vector<T>(args.scalar_size, static_cast<T>(0)); buffers.scalar.Read(queue, args.scalar_size, buffers_host.scalar); }
+ else if (name == kBufScalarUint) { buffers_host.scalar_uint = std::vector<unsigned int>(args.scalar_size, 0); buffers.scalar_uint.Read(queue, args.scalar_size, buffers_host.scalar_uint); }
else { throw std::runtime_error("Invalid buffer name"); }
}
}
@@ -67,6 +68,7 @@ void HostToDevice(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T>
else if (name == kBufMatC) { buffers.c_mat.Write(queue, args.c_size, buffers_host.c_mat); }
else if (name == kBufMatAP) { buffers.ap_mat.Write(queue, args.ap_size, buffers_host.ap_mat); }
else if (name == kBufScalar) { buffers.scalar.Write(queue, args.scalar_size, buffers_host.scalar); }
+ else if (name == kBufScalarUint) { buffers.scalar_uint.Write(queue, args.scalar_size, buffers_host.scalar_uint); }
else { throw std::runtime_error("Invalid buffer name"); }
}
}
diff --git a/test/test_utilities.hpp b/test/test_utilities.hpp
index 7bf5e65f..4ea5cdfb 100644
--- a/test/test_utilities.hpp
+++ b/test/test_utilities.hpp
@@ -56,6 +56,7 @@ struct Buffers {
Buffer<T> c_mat;
Buffer<T> ap_mat;
Buffer<T> scalar;
+ Buffer<unsigned int> scalar_uint;
};
template <typename T>
struct BuffersHost {
@@ -66,6 +67,7 @@ struct BuffersHost {
std::vector<T> c_mat;
std::vector<T> ap_mat;
std::vector<T> scalar;
+ std::vector<unsigned int> scalar_uint;
};
// =================================================================================================
diff --git a/test/wrapper_cblas.hpp b/test/wrapper_cblas.hpp
index a47ff725..fe5c7a43 100644
--- a/test/wrapper_cblas.hpp
+++ b/test/wrapper_cblas.hpp
@@ -455,38 +455,37 @@ void cblasXasum(const size_t n,
// Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
void cblasXamax(const size_t n,
- std::vector<float>& imax_buffer, const size_t imax_offset,
+ std::vector<unsigned int>& imax_buffer, const size_t imax_offset,
const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
- reinterpret_cast<int*>(&imax_buffer[0])[imax_offset] = static_cast<int>(cblas_isamax(static_cast<int>(n),
- &x_buffer[x_offset], static_cast<int>(x_inc)));
+ imax_buffer[imax_offset] = cblas_isamax(static_cast<int>(n),
+ &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXamax(const size_t n,
- std::vector<double>& imax_buffer, const size_t imax_offset,
+ std::vector<unsigned int>& imax_buffer, const size_t imax_offset,
const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
- reinterpret_cast<int*>(&imax_buffer[0])[imax_offset] = static_cast<int>(cblas_idamax(static_cast<int>(n),
- &x_buffer[x_offset], static_cast<int>(x_inc)));
+ imax_buffer[imax_offset] = cblas_idamax(static_cast<int>(n),
+ &x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXamax(const size_t n,
- std::vector<float2>& imax_buffer, const size_t imax_offset,
+ std::vector<unsigned int>& imax_buffer, const size_t imax_offset,
const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
- reinterpret_cast<int*>(&imax_buffer[0])[imax_offset] = static_cast<int>(cblas_icamax(static_cast<int>(n),
- reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc)));
+ imax_buffer[imax_offset] = cblas_icamax(static_cast<int>(n),
+ reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
void cblasXamax(const size_t n,
- std::vector<double2>& imax_buffer, const size_t imax_offset,
+ std::vector<unsigned int>& imax_buffer, const size_t imax_offset,
const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
- reinterpret_cast<int*>(&imax_buffer[0])[imax_offset] = static_cast<int>(cblas_izamax(static_cast<int>(n),
- reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)));
+ imax_buffer[imax_offset] = cblas_izamax(static_cast<int>(n),
+ reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
void cblasXamax(const size_t n,
- std::vector<half>& imax_buffer, const size_t imax_offset,
+ std::vector<unsigned int>& imax_buffer, const size_t imax_offset,
const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) {
auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
- auto imax_buffer_bis = HalfToFloatBuffer(imax_buffer);
+ auto imax_buffer_bis = imax_buffer;
cblasXamax(n,
imax_buffer_bis, imax_offset,
x_buffer_bis, x_offset, x_inc);
- FloatToHalfBuffer(imax_buffer, imax_buffer_bis);
}
// =================================================================================================
diff --git a/test/wrapper_clblas.hpp b/test/wrapper_clblas.hpp
index f1b3a0c4..8fdff2bc 100644
--- a/test/wrapper_clblas.hpp
+++ b/test/wrapper_clblas.hpp
@@ -744,13 +744,13 @@ clblasStatus clblasXasum<half>(const size_t n,
// Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
template <typename T>
clblasStatus clblasXamax(const size_t n,
- Buffer<T>& imax_buffer, const size_t imax_offset,
+ Buffer<unsigned int>& imax_buffer, const size_t imax_offset,
const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
template <>
clblasStatus clblasXamax<float>(const size_t n,
- Buffer<float>& imax_buffer, const size_t imax_offset,
+ Buffer<unsigned int>& imax_buffer, const size_t imax_offset,
const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
@@ -765,7 +765,7 @@ clblasStatus clblasXamax<float>(const size_t n,
}
template <>
clblasStatus clblasXamax<double>(const size_t n,
- Buffer<double>& imax_buffer, const size_t imax_offset,
+ Buffer<unsigned int>& imax_buffer, const size_t imax_offset,
const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
@@ -780,7 +780,7 @@ clblasStatus clblasXamax<double>(const size_t n,
}
template <>
clblasStatus clblasXamax<float2>(const size_t n,
- Buffer<float2>& imax_buffer, const size_t imax_offset,
+ Buffer<unsigned int>& imax_buffer, const size_t imax_offset,
const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
@@ -795,7 +795,7 @@ clblasStatus clblasXamax<float2>(const size_t n,
}
template <>
clblasStatus clblasXamax<double2>(const size_t n,
- Buffer<double2>& imax_buffer, const size_t imax_offset,
+ Buffer<unsigned int>& imax_buffer, const size_t imax_offset,
const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
@@ -810,17 +810,16 @@ clblasStatus clblasXamax<double2>(const size_t n,
}
template <>
clblasStatus clblasXamax<half>(const size_t n,
- Buffer<half>& imax_buffer, const size_t imax_offset,
+ Buffer<unsigned int>& imax_buffer, const size_t imax_offset,
const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
- auto imax_buffer_bis = HalfToFloatBuffer(imax_buffer, queues[0]);
+ auto imax_buffer_bis = imax_buffer;
auto status = clblasXamax(n,
imax_buffer_bis, imax_offset,
x_buffer_bis, x_offset, x_inc,
num_queues, queues, num_wait_events, wait_events, events);
- FloatToHalfBuffer(imax_buffer, imax_buffer_bis, queues[0]);
return status;
}
diff --git a/test/wrapper_cublas.hpp b/test/wrapper_cublas.hpp
index 35b1b9c6..0a10a5ed 100644
--- a/test/wrapper_cublas.hpp
+++ b/test/wrapper_cublas.hpp
@@ -576,11 +576,11 @@ cublasStatus_t cublasXasum<half>(cublasHandle_t handle, const size_t n,
// Forwards the cuBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
template <typename T>
cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n,
- T* imax_buffer, const size_t imax_offset,
+ unsigned int * imax_buffer, const size_t imax_offset,
const T* x_buffer, const size_t x_offset, const size_t x_inc);
template <>
cublasStatus_t cublasXamax<float>(cublasHandle_t handle, const size_t n,
- float* imax_buffer, const size_t imax_offset,
+ unsigned int * imax_buffer, const size_t imax_offset,
const float* x_buffer, const size_t x_offset, const size_t x_inc) {
auto status = cublasIsamax(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
@@ -590,7 +590,7 @@ cublasStatus_t cublasXamax<float>(cublasHandle_t handle, const size_t n,
}
template <>
cublasStatus_t cublasXamax<double>(cublasHandle_t handle, const size_t n,
- double* imax_buffer, const size_t imax_offset,
+ unsigned int * imax_buffer, const size_t imax_offset,
const double* x_buffer, const size_t x_offset, const size_t x_inc) {
auto status = cublasIdamax(handle, static_cast<int>(n),
&x_buffer[x_offset], static_cast<int>(x_inc),
@@ -600,7 +600,7 @@ cublasStatus_t cublasXamax<double>(cublasHandle_t handle, const size_t n,
}
template <>
cublasStatus_t cublasXamax<float2>(cublasHandle_t handle, const size_t n,
- float2* imax_buffer, const size_t imax_offset,
+ unsigned int * imax_buffer, const size_t imax_offset,
const float2* x_buffer, const size_t x_offset, const size_t x_inc) {
auto status = cublasIcamax(handle, static_cast<int>(n),
reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
@@ -610,7 +610,7 @@ cublasStatus_t cublasXamax<float2>(cublasHandle_t handle, const size_t n,
}
template <>
cublasStatus_t cublasXamax<double2>(cublasHandle_t handle, const size_t n,
- double2* imax_buffer, const size_t imax_offset,
+ unsigned int * imax_buffer, const size_t imax_offset,
const double2* x_buffer, const size_t x_offset, const size_t x_inc) {
auto status = cublasIzamax(handle, static_cast<int>(n),
reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
@@ -620,7 +620,7 @@ cublasStatus_t cublasXamax<double2>(cublasHandle_t handle, const size_t n,
}
template <>
cublasStatus_t cublasXamax<half>(cublasHandle_t handle, const size_t n,
- half* imax_buffer, const size_t imax_offset,
+ unsigned int * imax_buffer, const size_t imax_offset,
const half* x_buffer, const size_t x_offset, const size_t x_inc) {
return CUBLAS_STATUS_NOT_SUPPORTED;
}