From 3d0c227fa5004067d857c74f7963876b34ed4170 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 7 May 2023 20:02:52 +0200
Subject: AMAX/AMIN integer testing and bug fixes (#457)

* Fixed a bug in XAMAX/XMIN routines that caused the increment and offset to be included in the result

* Perform proper integer-output testing in XAMAX tests

* A few changes towards getting it ready for a PR

* Also fix compilation for clBLAS and cuBLAS references

* Fix a bug that would only use the real part of complex numbers in the amax/amin routines

* A few small fixes related to the AMAX tests
---
 CHANGELOG                              |  3 +++
 scripts/generator/generator/cpp.py     | 20 ++++++++++++--------
 scripts/generator/generator/routine.py | 15 ++++++++++++---
 src/kernels/level1/xamax.opencl        |  4 ++--
 src/utilities/utilities.hpp            |  1 +
 test/correctness/testblas.cpp          | 15 +++++++++++----
 test/performance/client.cpp            |  3 ++-
 test/routines/level1/xamax.hpp         | 27 +++++++++++++++++----------
 test/test_utilities.cpp                |  2 ++
 test/test_utilities.hpp                |  2 ++
 test/wrapper_cblas.hpp                 | 29 ++++++++++++++---------------
 test/wrapper_clblas.hpp                | 15 +++++++--------
 test/wrapper_cublas.hpp                | 12 ++++++------
 13 files changed, 91 insertions(+), 57 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 9409cae7..3b84ffc7 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -5,6 +5,9 @@ Development version (next version)
   * Toggle OpenCL kernel compilation options for Adreno
   * New preprocessor directive RELAX_WORKGROUP_SIZE
 - Fixed a bug in handling of #undef in CLBlast loop unrolling and array-to-register mapping functions
+- Fixed a bug in XAMAX/XAMIN routines related to inadvertently including the increment and offset in the result
+- Fixed a bug in XAMAX/XAMIN routines that would cause only the real part of a complex number to be taken into account
+- Fixed a bug that caused tests to not properly do integer-output testing (for XAMAX/XAMIN)
 - Fixed a documentation bug in the 'ld' requirements
 - Added tuned parameters for various devices (see doc/tuning.md)
 
diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py
index 6dc3fc93..e32738ee 100644
--- a/scripts/generator/generator/cpp.py
+++ b/scripts/generator/generator/cpp.py
@@ -226,7 +226,10 @@ def wrapper_clblas(routine):
 
                 # Convert to float (note: also integer buffers are stored as half/float)
                 for buf in routine.inputs + routine.outputs:
-                    result += "  auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL
+                    if buf not in routine.index_buffers():
+                        result += "  auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL
+                    else:
+                        result += "  auto " + buf + "_buffer_bis = " + buf + "_buffer;" + NL
 
                 # Call the float routine
                 result += "  auto status = clblasX" + routine.name + "("
@@ -236,7 +239,8 @@ def wrapper_clblas(routine):
 
                 # Convert back to half
                 for buf in routine.outputs:
-                    result += "  FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL
+                    if buf not in routine.index_buffers():
+                        result += "  FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL
                 result += "  return status;"
 
             # Complete
@@ -276,10 +280,6 @@ def wrapper_cblas(routine):
                             extra_argument += "," + NL + indent
                             extra_argument += "reinterpret_cast<return_pointer_" + flavour.buffer_type[:-1] + ">"
                             extra_argument += "(&" + output_buffer + "_buffer[" + output_buffer + "_offset])"
-                        elif output_buffer in routine.index_buffers():
-                            assignment = "reinterpret_cast<int*>(&" + output_buffer + "_buffer[0])[" + output_buffer + "_offset] = static_cast<int>("
-                            postpostfix = ")"
-                            indent += " " * (len(assignment) + 1)
                         else:
                             assignment = output_buffer + "_buffer[" + output_buffer + "_offset]"
                             if flavour.name in ["Sc", "Dz"]:
@@ -299,7 +299,10 @@ def wrapper_cblas(routine):
 
                 # Convert to float (note: also integer buffers are stored as half/float)
                 for buf in routine.inputs + routine.outputs:
-                    result += "  auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer);" + NL
+                    if buf not in routine.index_buffers():
+                        result += "  auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer);" + NL
+                    else:
+                        result += "  auto " + buf + "_buffer_bis = " + buf + "_buffer;" + NL
 
                 # Call the float routine
                 result += "  cblasX" + routine.name + "("
@@ -308,7 +311,8 @@ def wrapper_cblas(routine):
 
                 # Convert back to half
                 for buf in routine.outputs:
-                    result += "  FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis);" + NL
+                    if buf not in routine.index_buffers():
+                        result += "  FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis);" + NL
 
             # Complete
             result += "}" + NL
diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py
index 8b6ab57f..c2201c0d 100644
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@@ -282,7 +282,10 @@ class Routine:
         """As above but for OpenCL"""
         prefix = "const " if name in self.inputs else ""
         if name in self.inputs or name in self.outputs:
-            a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"]
+            if name == "imax":
+                a = [prefix + "Buffer<unsigned int>& " + name + "_buffer"]
+            else:
+                a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"]
             b = ["const size_t " + name + "_offset"]
             c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
             return [", ".join(a + b + c)]
@@ -292,7 +295,10 @@ class Routine:
         """As above but for CUDA"""
         prefix = "const " if name in self.inputs else ""
         if name in self.inputs or name in self.outputs:
-            a = [prefix + flavour.buffer_type + "* " + name + "_buffer"]
+            if name == "imax":
+                a = [prefix + "unsigned int * " + name + "_buffer"]
+            else:
+                a = [prefix + flavour.buffer_type + "* " + name + "_buffer"]
             b = ["const size_t " + name + "_offset"]
             c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
             return [", ".join(a + b + c)]
@@ -302,7 +308,10 @@ class Routine:
         """As above but as vectors"""
         prefix = "const " if name in self.inputs else ""
         if name in self.inputs or name in self.outputs:
-            a = [prefix + "std::vector<" + flavour.buffer_type + ">& " + name + "_buffer"]
+            if name == "imax":
+                a = [prefix + "std::vector<unsigned int>& " + name + "_buffer"]
+            else:
+                a = [prefix + "std::vector<" + flavour.buffer_type + ">& " + name + "_buffer"]
             b = ["const size_t " + name + "_offset"]
             c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
             return [", ".join(a + b + c)]
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 06a6773b..7cbbd6b5 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -55,7 +55,7 @@ void Xamax(const int n,
   while (id < n) {
     const int x_index = id*x_inc + x_offset;
     #if PRECISION == 3232 || PRECISION == 6464
-      singlereal x = xgm[x_index].x;
+      singlereal x = fabs(xgm[x_index].x) + fabs(xgm[x_index].y);
     #else
       singlereal x = xgm[x_index];
     #endif
@@ -70,7 +70,7 @@ void Xamax(const int n,
     #endif
     if (x > max) {
       max = x;
-      imax = id*x_inc + x_offset;
+      imax = id;
     }
     id += WGS1*num_groups;
   }
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index b66df118..3ed51dd6 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -133,6 +133,7 @@ constexpr auto kBufMatB = "B";
 constexpr auto kBufMatC = "C";
 constexpr auto kBufMatAP = "AP";
 constexpr auto kBufScalar = "Scalar";
+constexpr auto kBufScalarUint = "ScalarUint";
 
 // =================================================================================================
 
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index d28aba40..0d0ce7c6 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -124,6 +124,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
     auto c_mat2 = Buffer<T>(context_, args.c_size);
     auto ap_mat2 = Buffer<T>(context_, args.ap_size);
     auto scalar2 = Buffer<T>(context_, args.scalar_size);
+    auto scalar_uint2 = Buffer<unsigned int>(context_, args.scalar_size);
     x_vec2.Write(queue_, args.x_size, x_source_);
     y_vec2.Write(queue_, args.y_size, y_source_);
     a_mat2.Write(queue_, args.a_size, a_source_);
@@ -131,7 +132,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
     c_mat2.Write(queue_, args.c_size, c_source_);
     ap_mat2.Write(queue_, args.ap_size, ap_source_);
     scalar2.Write(queue_, args.scalar_size, scalar_source_);
-    auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
+    auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2, scalar_uint2};
 
     // Runs CLBlast
     if (verbose_) {
@@ -158,6 +159,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
     auto c_mat1 = Buffer<T>(context_, args.c_size);
     auto ap_mat1 = Buffer<T>(context_, args.ap_size);
     auto scalar1 = Buffer<T>(context_, args.scalar_size);
+    auto scalar_uint1 = Buffer<unsigned int>(context_, args.scalar_size);
     x_vec1.Write(queue_, args.x_size, x_source_);
     y_vec1.Write(queue_, args.y_size, y_source_);
     a_mat1.Write(queue_, args.a_size, a_source_);
@@ -165,7 +167,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
     c_mat1.Write(queue_, args.c_size, c_source_);
     ap_mat1.Write(queue_, args.ap_size, ap_source_);
     scalar1.Write(queue_, args.scalar_size, scalar_source_);
-    auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
+    auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1, scalar_uint1};
 
     // Runs the reference code
     if (verbose_) {
@@ -221,6 +223,9 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
     // Checks for differences in the 'canary' region to detect buffer overflows
     for (auto canary_id=size_t{0}; canary_id<kCanarySize; ++canary_id) {
       auto index = get_index_(args, get_id1_(args) - 1, get_id2_(args) - 1) + canary_id;
+      if (index >= result1.size() || index >= result2.size()) {
+        continue;
+      }
       if (!TestSimilarity(result1[index], result2[index])) {
         errors++;
         if (verbose_) {
@@ -273,6 +278,7 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st
     auto c_mat1 = CreateInvalidBuffer<T>(context_, args.c_size);
     auto ap_mat1 = CreateInvalidBuffer<T>(context_, args.ap_size);
     auto scalar1 = CreateInvalidBuffer<T>(context_, args.scalar_size);
+    auto scalar_uint1 = CreateInvalidBuffer<unsigned int>(context_, args.scalar_size);
     auto x_vec2 = CreateInvalidBuffer<T>(context_, args.x_size);
     auto y_vec2 = CreateInvalidBuffer<T>(context_, args.y_size);
     auto a_mat2 = CreateInvalidBuffer<T>(context_, args.a_size);
@@ -280,8 +286,9 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st
     auto c_mat2 = CreateInvalidBuffer<T>(context_, args.c_size);
     auto ap_mat2 = CreateInvalidBuffer<T>(context_, args.ap_size);
     auto scalar2 = CreateInvalidBuffer<T>(context_, args.scalar_size);
-    auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
-    auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
+    auto scalar_uint2 = CreateInvalidBuffer<unsigned int>(context_, args.scalar_size);
+    auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1, scalar_uint1};
+    auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2, scalar_uint2};
 
     // Runs CLBlast
     if (verbose_) {
diff --git a/test/performance/client.cpp b/test/performance/client.cpp
index 34891429..e6930203 100644
--- a/test/performance/client.cpp
+++ b/test/performance/client.cpp
@@ -246,6 +246,7 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
     auto c_mat = Buffer<T>(context, args.c_size);
     auto ap_mat = Buffer<T>(context, args.ap_size);
     auto scalar = Buffer<T>(context, args.scalar_size);
+    auto scalar_uint = Buffer<unsigned int>(context, args.scalar_size);
     x_vec.Write(queue, args.x_size, x_source);
     y_vec.Write(queue, args.y_size, y_source);
     a_mat.Write(queue, args.a_size, a_source);
@@ -253,7 +254,7 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
     c_mat.Write(queue, args.c_size, c_source);
     ap_mat.Write(queue, args.ap_size, ap_source);
     scalar.Write(queue, args.scalar_size, scalar_source);
-    auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar};
+    auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar, scalar_uint};
 
     // Runs the routines and collects the timings
     auto timings = std::vector<std::pair<std::string, TimeResult>>();
diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp
index 71c1a0ec..aa0f8e10 100644
--- a/test/routines/level1/xamax.hpp
+++ b/test/routines/level1/xamax.hpp
@@ -35,15 +35,15 @@ class TestXamax {
             kArgXInc,
             kArgXOffset, kArgImaxOffset};
   }
-  static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufScalar}; }
-  static std::vector<std::string> BuffersOut() { return {kBufScalar}; }
+  static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufScalarUint}; }
+  static std::vector<std::string> BuffersOut() { return {kBufScalarUint}; }
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) {
     return args.n * args.x_inc + args.x_offset;
   }
   static size_t GetSizeImax(const Arguments<T> &args) {
-    return (1 + args.imax_offset) * 2; // always a 4-byte integer, this is a hack for FP16
+    return args.imax_offset + 1;
   }
 
   // Describes how to set the sizes of all the buffers
@@ -73,13 +73,13 @@ class TestXamax {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = Amax<T>(args.n,
-                            buffers.scalar(), args.imax_offset,
+                            buffers.scalar_uint(), args.imax_offset,
                             buffers.x_vec(), args.x_offset, args.x_inc,
                             &queue_plain, &event);
       if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     #elif CUDA_API
       auto status = Amax<T>(args.n,
-                            buffers.scalar(), args.imax_offset,
+                            buffers.scalar_uint(), args.imax_offset,
                             buffers.x_vec(), args.x_offset, args.x_inc,
                             queue.GetContext()(), queue.GetDevice()());
       cuStreamSynchronize(queue());
@@ -93,7 +93,7 @@ class TestXamax {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXamax<T>(args.n,
-                                   buffers.scalar, args.imax_offset,
+                                   buffers.scalar_uint, args.imax_offset,
                                    buffers.x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
@@ -105,7 +105,7 @@ class TestXamax {
   #ifdef CLBLAST_REF_CBLAS
     static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
       cblasXamax(args.n,
-                 buffers_host.scalar, args.imax_offset,
+                 buffers_host.scalar_uint, args.imax_offset,
                  buffers_host.x_vec, args.x_offset, args.x_inc);
       return StatusCode::kSuccess;
     }
@@ -115,7 +115,7 @@ class TestXamax {
   #ifdef CLBLAST_REF_CUBLAS
     static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) {
       auto status = cublasXamax(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n,
-                                buffers.scalar, args.imax_offset,
+                                buffers.scalar_uint, args.imax_offset,
                                 buffers.x_vec, args.x_offset, args.x_inc);
       if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; }
     }
@@ -123,8 +123,15 @@ class TestXamax {
 
   // Describes how to download the results of the computation (more importantly: which buffer)
   static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
-    std::vector<T> result(args.scalar_size, static_cast<T>(0));
-    buffers.scalar.Read(queue, args.scalar_size, result);
+    std::vector<unsigned int> result_uint(args.scalar_size, 0);
+    buffers.scalar_uint.Read(queue, args.scalar_size, result_uint);
+    // The result is an integer. However, since the test infrastructure assumes results of
+    // type 'T' (float/double/float2/double2/half), we store the results into T instead.
+    // The values might then become meaningless, but a comparison for testing should still
+    // be valid to verify correctness.
+    auto result_as_T = static_cast<T>(result_uint[0]);
+    std::vector<T> result(args.scalar_size);
+    result[0] = result_as_T;
     return result;
   }
 
diff --git a/test/test_utilities.cpp b/test/test_utilities.cpp
index 8029d259..f3116238 100644
--- a/test/test_utilities.cpp
+++ b/test/test_utilities.cpp
@@ -52,6 +52,7 @@ void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T>
     else if (name == kBufMatC) { buffers_host.c_mat = std::vector<T>(args.c_size, static_cast<T>(0)); buffers.c_mat.Read(queue, args.c_size, buffers_host.c_mat); }
     else if (name == kBufMatAP) { buffers_host.ap_mat = std::vector<T>(args.ap_size, static_cast<T>(0)); buffers.ap_mat.Read(queue, args.ap_size, buffers_host.ap_mat); }
     else if (name == kBufScalar) { buffers_host.scalar = std::vector<T>(args.scalar_size, static_cast<T>(0)); buffers.scalar.Read(queue, args.scalar_size, buffers_host.scalar); }
+    else if (name == kBufScalarUint) { buffers_host.scalar_uint = std::vector<unsigned int>(args.scalar_size, 0); buffers.scalar_uint.Read(queue, args.scalar_size, buffers_host.scalar_uint); }
     else { throw std::runtime_error("Invalid buffer name"); }
   }
 }
@@ -67,6 +68,7 @@ void HostToDevice(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T>
     else if (name == kBufMatC) { buffers.c_mat.Write(queue, args.c_size, buffers_host.c_mat); }
     else if (name == kBufMatAP) { buffers.ap_mat.Write(queue, args.ap_size, buffers_host.ap_mat); }
     else if (name == kBufScalar) { buffers.scalar.Write(queue, args.scalar_size, buffers_host.scalar); }
+    else if (name == kBufScalarUint) { buffers.scalar_uint.Write(queue, args.scalar_size, buffers_host.scalar_uint); }
     else { throw std::runtime_error("Invalid buffer name"); }
   }
 }
diff --git a/test/test_utilities.hpp b/test/test_utilities.hpp
index 7bf5e65f..4ea5cdfb 100644
--- a/test/test_utilities.hpp
+++ b/test/test_utilities.hpp
@@ -56,6 +56,7 @@ struct Buffers {
   Buffer<T> c_mat;
   Buffer<T> ap_mat;
   Buffer<T> scalar;
+  Buffer<unsigned int> scalar_uint;
 };
 template <typename T>
 struct BuffersHost {
@@ -66,6 +67,7 @@ struct BuffersHost {
   std::vector<T> c_mat;
   std::vector<T> ap_mat;
   std::vector<T> scalar;
+  std::vector<unsigned int> scalar_uint;
 };
 
 // =================================================================================================
diff --git a/test/wrapper_cblas.hpp b/test/wrapper_cblas.hpp
index a47ff725..fe5c7a43 100644
--- a/test/wrapper_cblas.hpp
+++ b/test/wrapper_cblas.hpp
@@ -455,38 +455,37 @@ void cblasXasum(const size_t n,
 
 // Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
 void cblasXamax(const size_t n,
-                std::vector<float>& imax_buffer, const size_t imax_offset,
+                std::vector<unsigned int>& imax_buffer, const size_t imax_offset,
                 const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
-  reinterpret_cast<int*>(&imax_buffer[0])[imax_offset] = static_cast<int>(cblas_isamax(static_cast<int>(n),
-                                                                                       &x_buffer[x_offset], static_cast<int>(x_inc)));
+  imax_buffer[imax_offset] = cblas_isamax(static_cast<int>(n),
+                                         &x_buffer[x_offset], static_cast<int>(x_inc));
 }
 void cblasXamax(const size_t n,
-                std::vector<double>& imax_buffer, const size_t imax_offset,
+                std::vector<unsigned int>& imax_buffer, const size_t imax_offset,
                 const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
-  reinterpret_cast<int*>(&imax_buffer[0])[imax_offset] = static_cast<int>(cblas_idamax(static_cast<int>(n),
-                                                                                       &x_buffer[x_offset], static_cast<int>(x_inc)));
+  imax_buffer[imax_offset] = cblas_idamax(static_cast<int>(n),
+                                         &x_buffer[x_offset], static_cast<int>(x_inc));
 }
 void cblasXamax(const size_t n,
-                std::vector<float2>& imax_buffer, const size_t imax_offset,
+                std::vector<unsigned int>& imax_buffer, const size_t imax_offset,
                 const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
-  reinterpret_cast<int*>(&imax_buffer[0])[imax_offset] = static_cast<int>(cblas_icamax(static_cast<int>(n),
-                                                                                       reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc)));
+  imax_buffer[imax_offset] = cblas_icamax(static_cast<int>(n),
+                                         reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
 }
 void cblasXamax(const size_t n,
-                std::vector<double2>& imax_buffer, const size_t imax_offset,
+                std::vector<unsigned int>& imax_buffer, const size_t imax_offset,
                 const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
-  reinterpret_cast<int*>(&imax_buffer[0])[imax_offset] = static_cast<int>(cblas_izamax(static_cast<int>(n),
-                                                                                       reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)));
+  imax_buffer[imax_offset] = cblas_izamax(static_cast<int>(n),
+                                         reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
 }
 void cblasXamax(const size_t n,
-                std::vector<half>& imax_buffer, const size_t imax_offset,
+                std::vector<unsigned int>& imax_buffer, const size_t imax_offset,
                 const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) {
   auto x_buffer_bis = HalfToFloatBuffer(x_buffer);
-  auto imax_buffer_bis = HalfToFloatBuffer(imax_buffer);
+  auto imax_buffer_bis = imax_buffer;
   cblasXamax(n,
              imax_buffer_bis, imax_offset,
              x_buffer_bis, x_offset, x_inc);
-  FloatToHalfBuffer(imax_buffer, imax_buffer_bis);
 }
 
 // =================================================================================================
diff --git a/test/wrapper_clblas.hpp b/test/wrapper_clblas.hpp
index f1b3a0c4..8fdff2bc 100644
--- a/test/wrapper_clblas.hpp
+++ b/test/wrapper_clblas.hpp
@@ -744,13 +744,13 @@ clblasStatus clblasXasum<half>(const size_t n,
 // Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
 template <typename T>
 clblasStatus clblasXamax(const size_t n,
-                         Buffer<T>& imax_buffer, const size_t imax_offset,
+                         Buffer<unsigned int>& imax_buffer, const size_t imax_offset,
                          const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXamax<float>(const size_t n,
-                                Buffer<float>& imax_buffer, const size_t imax_offset,
+                                Buffer<unsigned int>& imax_buffer, const size_t imax_offset,
                                 const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
@@ -765,7 +765,7 @@ clblasStatus clblasXamax<float>(const size_t n,
 }
 template <>
 clblasStatus clblasXamax<double>(const size_t n,
-                                 Buffer<double>& imax_buffer, const size_t imax_offset,
+                                 Buffer<unsigned int>& imax_buffer, const size_t imax_offset,
                                  const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
@@ -780,7 +780,7 @@ clblasStatus clblasXamax<double>(const size_t n,
 }
 template <>
 clblasStatus clblasXamax<float2>(const size_t n,
-                                 Buffer<float2>& imax_buffer, const size_t imax_offset,
+                                 Buffer<unsigned int>& imax_buffer, const size_t imax_offset,
                                  const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
@@ -795,7 +795,7 @@ clblasStatus clblasXamax<float2>(const size_t n,
 }
 template <>
 clblasStatus clblasXamax<double2>(const size_t n,
-                                  Buffer<double2>& imax_buffer, const size_t imax_offset,
+                                  Buffer<unsigned int>& imax_buffer, const size_t imax_offset,
                                   const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
@@ -810,17 +810,16 @@ clblasStatus clblasXamax<double2>(const size_t n,
 }
 template <>
 clblasStatus clblasXamax<half>(const size_t n,
-                               Buffer<half>& imax_buffer, const size_t imax_offset,
+                               Buffer<unsigned int>& imax_buffer, const size_t imax_offset,
                                const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
                                cl_uint num_queues, cl_command_queue *queues,
                                cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
-  auto imax_buffer_bis = HalfToFloatBuffer(imax_buffer, queues[0]);
+  auto imax_buffer_bis = imax_buffer;
   auto status = clblasXamax(n,
                             imax_buffer_bis, imax_offset,
                             x_buffer_bis, x_offset, x_inc,
                             num_queues, queues, num_wait_events, wait_events, events);
-  FloatToHalfBuffer(imax_buffer, imax_buffer_bis, queues[0]);
   return status;
 }
 
diff --git a/test/wrapper_cublas.hpp b/test/wrapper_cublas.hpp
index 35b1b9c6..0a10a5ed 100644
--- a/test/wrapper_cublas.hpp
+++ b/test/wrapper_cublas.hpp
@@ -576,11 +576,11 @@ cublasStatus_t cublasXasum<half>(cublasHandle_t handle, const size_t n,
 // Forwards the cuBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
 template <typename T>
 cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n,
-                           T* imax_buffer, const size_t imax_offset,
+                           unsigned int * imax_buffer, const size_t imax_offset,
                            const T* x_buffer, const size_t x_offset, const size_t x_inc);
 template <>
 cublasStatus_t cublasXamax<float>(cublasHandle_t handle, const size_t n,
-                                  float* imax_buffer, const size_t imax_offset,
+                                  unsigned int * imax_buffer, const size_t imax_offset,
                                   const float* x_buffer, const size_t x_offset, const size_t x_inc) {
   auto status = cublasIsamax(handle, static_cast<int>(n),
                             &x_buffer[x_offset], static_cast<int>(x_inc),
@@ -590,7 +590,7 @@ cublasStatus_t cublasXamax<float>(cublasHandle_t handle, const size_t n,
 }
 template <>
 cublasStatus_t cublasXamax<double>(cublasHandle_t handle, const size_t n,
-                                   double* imax_buffer, const size_t imax_offset,
+                                   unsigned int * imax_buffer, const size_t imax_offset,
                                    const double* x_buffer, const size_t x_offset, const size_t x_inc) {
   auto status = cublasIdamax(handle, static_cast<int>(n),
                             &x_buffer[x_offset], static_cast<int>(x_inc),
@@ -600,7 +600,7 @@ cublasStatus_t cublasXamax<double>(cublasHandle_t handle, const size_t n,
 }
 template <>
 cublasStatus_t cublasXamax<float2>(cublasHandle_t handle, const size_t n,
-                                   float2* imax_buffer, const size_t imax_offset,
+                                   unsigned int * imax_buffer, const size_t imax_offset,
                                    const float2* x_buffer, const size_t x_offset, const size_t x_inc) {
   auto status = cublasIcamax(handle, static_cast<int>(n),
                             reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
@@ -610,7 +610,7 @@ cublasStatus_t cublasXamax<float2>(cublasHandle_t handle, const size_t n,
 }
 template <>
 cublasStatus_t cublasXamax<double2>(cublasHandle_t handle, const size_t n,
-                                    double2* imax_buffer, const size_t imax_offset,
+                                    unsigned int * imax_buffer, const size_t imax_offset,
                                     const double2* x_buffer, const size_t x_offset, const size_t x_inc) {
   auto status = cublasIzamax(handle, static_cast<int>(n),
                             reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
@@ -620,7 +620,7 @@ cublasStatus_t cublasXamax<double2>(cublasHandle_t handle, const size_t n,
 }
 template <>
 cublasStatus_t cublasXamax<half>(cublasHandle_t handle, const size_t n,
-                                 half* imax_buffer, const size_t imax_offset,
+                                 unsigned int * imax_buffer, const size_t imax_offset,
                                  const half* x_buffer, const size_t x_offset, const size_t x_inc) {
   return CUBLAS_STATUS_NOT_SUPPORTED;
 }
-- 
cgit v1.2.3