6 files changed, 95 insertions, 98 deletions
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index e94b4d30..d43b9b7c 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -100,7 +100,9 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   if (unit_diagonal == 0) {
     const real diagonal_value = lm[thread_index][thread_index];
     if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix
-      DivideReal(inverted_diagonal, inverted_diagonal, diagonal_value);
+      real constant_one;
+      SetToOne(constant_one);
+      DivideReal(inverted_diagonal, constant_one, diagonal_value);
     }
   }
   lm[thread_index][thread_index] = inverted_diagonal;
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index b2ed2f0c..9cf75490 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -24,100 +24,52 @@ namespace clblast {
 // =================================================================================================
 
 // Returns a scalar with a default value
-template <typename T>
-T GetScalar() {
-  return static_cast<T>(2.0);
-}
+template <typename T> T GetScalar() { return static_cast<T>(2.0); }
 template float GetScalar<float>();
 template double GetScalar<double>();
-
-// Specialized version of the above for half-precision
-template <>
-half GetScalar() {
-  return FloatToHalf(2.0f);
-}
-
-// Specialized versions of the above for complex data-types
-template <>
-float2 GetScalar() {
-  return {2.0f, 0.5f};
-}
-template <>
-double2 GetScalar() {
-  return {2.0, 0.5};
-}
+template <> half GetScalar() { return FloatToHalf(2.0f); }
+template <> float2 GetScalar() { return {2.0f, 0.5f}; }
+template <> double2 GetScalar() { return {2.0, 0.5}; }
 
 // Returns a scalar of value 0
-template <typename T>
-T ConstantZero() {
-  return static_cast<T>(0.0);
-}
+template <typename T> T ConstantZero() { return static_cast<T>(0.0); }
 template float ConstantZero<float>();
 template double ConstantZero<double>();
-
-// Specialized version of the above for half-precision
-template <>
-half ConstantZero() {
-  return FloatToHalf(0.0f);
-}
-
-// Specialized versions of the above for complex data-types
-template <>
-float2 ConstantZero() {
-  return {0.0f, 0.0f};
-}
-template <>
-double2 ConstantZero() {
-  return {0.0, 0.0};
-}
+template <> half ConstantZero() { return FloatToHalf(0.0f); }
+template <> float2 ConstantZero() { return {0.0f, 0.0f}; }
+template <> double2 ConstantZero() { return {0.0, 0.0}; }
 
 // Returns a scalar of value 1
-template <typename T>
-T ConstantOne() {
-  return static_cast<T>(1.0);
-}
+template <typename T> T ConstantOne() { return static_cast<T>(1.0); }
 template float ConstantOne<float>();
 template double ConstantOne<double>();
-
-// Specialized version of the above for half-precision
-template <>
-half ConstantOne() {
-  return FloatToHalf(1.0f);
-}
-
-// Specialized versions of the above for complex data-types
-template <>
-float2 ConstantOne() {
-  return {1.0f, 0.0f};
-}
-template <>
-double2 ConstantOne() {
-  return {1.0, 0.0};
-}
+template <> half ConstantOne() { return FloatToHalf(1.0f); }
+template <> float2 ConstantOne() { return {1.0f, 0.0f}; }
+template <> double2 ConstantOne() { return {1.0, 0.0}; }
 
 // Returns a scalar of value -1
-template <typename T>
-T ConstantNegOne() {
-  return static_cast<T>(-1.0);
-}
+template <typename T> T ConstantNegOne() { return static_cast<T>(-1.0); }
 template float ConstantNegOne<float>();
 template double ConstantNegOne<double>();
+template <> half ConstantNegOne() { return FloatToHalf(-1.0f); }
+template <> float2 ConstantNegOne() { return {-1.0f, 0.0f}; }
+template <> double2 ConstantNegOne() { return {-1.0, 0.0}; }
 
-// Specialized version of the above for half-precision
-template <>
-half ConstantNegOne() {
-  return FloatToHalf(-1.0f);
-}
-
-// Specialized versions of the above for complex data-types
-template <>
-float2 ConstantNegOne() {
-  return {-1.0f, 0.0f};
-}
-template <>
-double2 ConstantNegOne() {
-  return {-1.0, 0.0};
-}
+// Returns a scalar of value 2 (complex types: real part 2, imaginary part 0)
+template <typename T> T ConstantTwo() { return static_cast<T>(2.0); }
+template float ConstantTwo<float>();
+template double ConstantTwo<double>();
+template <> half ConstantTwo() { return FloatToHalf(2.0f); }
+template <> float2 ConstantTwo() { return {2.0f, 0.0f}; }
+template <> double2 ConstantTwo() { return {2.0, 0.0}; }
+
+// Returns a small scalar value just larger than 0 (1e-4: above the smallest normal IEEE half-precision value, ~6.1e-5)
+template <typename T> T SmallConstant() { return static_cast<T>(1e-4); }
+template float SmallConstant<float>();
+template double SmallConstant<double>();
+template <> half SmallConstant() { return FloatToHalf(1e-4f); }
+template <> float2 SmallConstant() { return {1e-4f, 0.0f}; }
+template <> double2 SmallConstant() { return {1e-4, 0.0}; }
 
 // Returns the absolute value of a scalar
 template <typename T> T AbsoluteValue(const T value) { return std::fabs(value); }
@@ -127,6 +79,14 @@ template <> half AbsoluteValue(const half value) { return FloatToHalf(std::fabs(
 template <> float2 AbsoluteValue(const float2 value) { return std::abs(value); }
 template <> double2 AbsoluteValue(const double2 value) { return std::abs(value); }
 
+// Returns whether a scalar lies strictly between -SmallConstant<T>() and +SmallConstant<T>(); half is compared as float. NOTE(review): the complex versions return true if EITHER component is close to zero ('||') - confirm '&&' was not intended
+template <typename T> bool IsCloseToZero(const T value) { return (value > -SmallConstant<T>()) && (value < SmallConstant<T>()); }
+template bool IsCloseToZero<float>(const float);
+template bool IsCloseToZero<double>(const double);
+template <> bool IsCloseToZero(const half value) { return IsCloseToZero(HalfToFloat(value)); }
+template <> bool IsCloseToZero(const float2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); }
+template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); }
+
 // =================================================================================================
 
 // Implements the string conversion using std::to_string if possible
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 2c13658b..044955ea 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -99,24 +99,20 @@ constexpr auto kArgNoAbbreviations = "no_abbrv";
 // =================================================================================================
 
 // Returns a scalar with a default value
-template <typename T>
-T GetScalar();
-
-// Returns a scalar of value 0
-template <typename T>
-T ConstantZero();
-
-// Returns a scalar of value 1
-template <typename T>
-T ConstantOne();
+template <typename T> T GetScalar();
 
-// Returns a scalar of value -1
-template <typename T>
-T ConstantNegOne();
+// Fixed value scalars
+template <typename T> T ConstantZero();
+template <typename T> T ConstantOne();
+template <typename T> T ConstantNegOne();
+template <typename T> T ConstantTwo();
+template <typename T> T SmallConstant();
 
 // Returns the absolute value of a scalar
-template <typename T>
-T AbsoluteValue(const T value);
+template <typename T> T AbsoluteValue(const T value);
+
+// Returns whether a scalar is close to zero
+template <typename T> bool IsCloseToZero(const T value);
 
 // =================================================================================================
 
diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp
index 811feac5..72ebdf9e 100644
--- a/test/routines/level2/xtrsv.hpp
+++ b/test/routines/level2/xtrsv.hpp
@@ -46,10 +46,10 @@ void PrepareData(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
     auto diagonal = a_mat_cpu[i*args.a_ld + i + args.a_offset];
     diagonal = AbsoluteValue(diagonal) + static_cast<T>(args.n / size_t{4});
     for (auto j = size_t{0}; j < args.n; ++j) {
-      a_mat_cpu[j*args.a_ld + i + args.a_offset] /= T{2.0};
+      a_mat_cpu[j*args.a_ld + i + args.a_offset] /= ConstantTwo<T>();
     }
     a_mat_cpu[i*args.a_ld + i + args.a_offset] = diagonal;
-    x_vec_cpu[i * args.x_inc + args.x_offset] /= T{2.0};
+    x_vec_cpu[i * args.x_inc + args.x_offset] /= ConstantTwo<T>();
   }
 
   // Copies input buffers back to the OpenCL device
diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp
index e59c96ff..246cb930 100644
--- a/test/routines/level3/xtrsm.hpp
+++ b/test/routines/level3/xtrsm.hpp
@@ -29,6 +29,37 @@
 namespace clblast {
 // =================================================================================================
 
+// Prepares the TRSM input data: rewrites the k-by-k region of the A matrix on the host and uploads it again
+template <typename T>
+void PrepareData(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  const auto k = (args.side == Side::kLeft) ? args.m : args.n; // m when A is applied from the left, n otherwise
+  if (args.a_ld < k) { return; } // leaves the buffers untouched when the leading dimension is smaller than k
+
+  // Copies both input buffers (A and B) from the OpenCL device to the host
+  std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
+  std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+  buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+  buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+
+  // Doubles every element in the k-by-k region of A and adds one to each result that IsCloseToZero reports
+  // TODO: Improve this way of generating 'proper' input for the TRSM routine
+  for (auto i = size_t{0}; i < k; ++i) {
+    for (auto j = size_t{0}; j < k; ++j) {
+      auto value = a_mat_cpu[j*args.a_ld + i + args.a_offset];
+      value *= ConstantTwo<T>();
+      if (IsCloseToZero(value)) { value += ConstantOne<T>(); }
+      a_mat_cpu[j*args.a_ld + i + args.a_offset] = value;
+    }
+  }
+
+  // Copies input buffers back to the OpenCL device (B is written back exactly as it was read)
+  buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
+  buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
+  return;
+}
+
+// =================================================================================================
+
 // See comment at top of file for a description of the class
 template <typename T>
 class TestXtrsm {
@@ -75,6 +106,7 @@ class TestXtrsm {
 
   // Describes how to run the CLBlast routine
   static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    PrepareData(args, buffers, queue);
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
@@ -89,6 +121,7 @@ class TestXtrsm {
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
     static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      PrepareData(args, buffers, queue);
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXtrsm(convertToCLBLAS(args.layout),
@@ -108,6 +141,7 @@ class TestXtrsm {
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
     static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      PrepareData(args, buffers, queue);
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
       buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp
index 4408e8d5..c6ce4b07 100644
--- a/test/routines/levelx/xinvert.hpp
+++ b/test/routines/levelx/xinvert.hpp
@@ -41,9 +41,14 @@ StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &qu
   const auto num_blocks = CeilDiv(args.n, block_size);
   const auto a_ld = args.a_ld;
   const auto b_ld = block_size;
-  if ((block_size == 0) || (args.n == 0) || (block_size > args.n)) {
+
+  // Checks for valid arguments
+  if ((block_size == 0) || (args.n == 0)) {
     return StatusCode::kInvalidDimension;
   }
+  if ((block_size % 16 != 0) || (block_size > 128)) {
+    return StatusCode::kUnknownError;
+  }
 
   // Loops over the amount of diagonal blocks of size args.m by args.m each
   for (auto block_id = size_t{0}; block_id < num_blocks; ++block_id) {