13 files changed, 237 insertions, 95 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 4c6a9be5..787793f0 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -9,7 +9,9 @@ Development version (next release)
 - Added level-1 routines:
   * SNRM2/DNRM2/ScNRM2/DzNRM2
   * SASUM/DASUM/ScASUM/DzASUM
+  * SSUM/DSUM/ScSUM/DzSUM (non-absolute version of the above xASUM BLAS routines)
   * iSAMAX/iDAMAX/iCAMAX/iZAMAX
+  * iSMAX/iDMAX/iCMAX/iZMAX (non-absolute version of the above ixAMAX BLAS routines)
 
 Version 0.6.0
 - Added support for MSVC (Visual Studio) 2015
diff --git a/README.md b/README.md
index b4f0981f..f2a85efc 100644
--- a/README.md
+++ b/README.md
@@ -169,64 +169,76 @@ These graphs can be generated automatically on your own device. First, compile C
 Supported routines
 -------------
 
-CLBlast is in active development but already supports almost all the BLAS routines. The currently supported routines are marked with '✔' in the following tables. Empty boxes represent routines that still need to be implemented in a future release, whereas routines marked with '-' are not part of BLAS at all.
-
-| Level-1  | S | D | C | Z | Notes   |
-| ---------|---|---|---|---|---------|
-| xROTG    |   |   | - | - |         |
-| xROTMG   |   |   | - | - |         |
-| xROT     |   |   | - | - |         |
-| xROTM    |   |   | - | - |         |
-| xSWAP    | ✔ | ✔ | ✔ | ✔ |         |
-| xSCAL    | ✔ | ✔ | ✔ | ✔ |         |
-| xCOPY    | ✔ | ✔ | ✔ | ✔ |         |
-| xAXPY    | ✔ | ✔ | ✔ | ✔ |         |
-| xDOT     | ✔ | ✔ | - | - |         |
-| xDOTU    | - | - | ✔ | ✔ |         |
-| xDOTC    | - | - | ✔ | ✔ |         |
-| xNRM2    | ✔ | ✔ | ✔ | ✔ |         |
-| xASUM    | ✔ | ✔ | ✔ | ✔ |         |
-| IxAMAX   | ✔ | ✔ | ✔ | ✔ |         |
-
-| Level-2  | S | D | C | Z | Notes   |
-| ---------|---|---|---|---|---------|
-| xGEMV    | ✔ | ✔ | ✔ | ✔ |         |
-| xGBMV    | ✔ | ✔ | ✔ | ✔ |         |
-| xHEMV    | - | - | ✔ | ✔ |         |
-| xHBMV    | - | - | ✔ | ✔ |         |
-| xHPMV    | - | - | ✔ | ✔ |         |
-| xSYMV    | ✔ | ✔ | - | - |         |
-| xSBMV    | ✔ | ✔ | - | - |         |
-| xSPMV    | ✔ | ✔ | - | - |         |
-| xTRMV    | ✔ | ✔ | ✔ | ✔ |         |
-| xTBMV    | ✔ | ✔ | ✔ | ✔ |         |
-| xTPMV    | ✔ | ✔ | ✔ | ✔ |         |
-| xTRSV    |   |   |   |   |         |
-| xTBSV    |   |   |   |   |         |
-| xTPSV    |   |   |   |   |         |
-| xGER     | ✔ | ✔ | - | - |         |
-| xGERU    | - | - | ✔ | ✔ |         |
-| xGERC    | - | - | ✔ | ✔ |         |
-| xHER     | - | - | ✔ | ✔ |         |
-| xHPR     | - | - | ✔ | ✔ |         |
-| xHER2    | - | - | ✔ | ✔ |         |
-| xHPR2    | - | - | ✔ | ✔ |         |
-| xSYR     | ✔ | ✔ | - | - |         |
-| xSPR     | ✔ | ✔ | - | - |         |
-| xSYR2    | ✔ | ✔ | - | - |         |
-| xSPR2    | ✔ | ✔ | - | - |         |
-
-| Level-3  | S | D | C | Z | Notes   |
-| ---------|---|---|---|---|---------|
-| xGEMM    | ✔ | ✔ | ✔ | ✔ |         |
-| xSYMM    | ✔ | ✔ | ✔ | ✔ |         |
-| xHEMM    | - | - | ✔ | ✔ |         |
-| xSYRK    | ✔ | ✔ | ✔ | ✔ |         |
-| xHERK    | - | - | ✔ | ✔ |         |
-| xSYR2K   | ✔ | ✔ | ✔ | ✔ |         |
-| xHER2K   | - | - | ✔ | ✔ |         |
-| xTRMM    | ✔ | ✔ | ✔ | ✔ |         |
-| xTRSM    |   |   |   |   |         |
+CLBlast is in active development but already supports almost all the BLAS routines. The supported routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all.
+
+| Level-1  | S | D | C | Z |
+| ---------|---|---|---|---|
+| xSWAP    | ✔ | ✔ | ✔ | ✔ |
+| xSCAL    | ✔ | ✔ | ✔ | ✔ |
+| xCOPY    | ✔ | ✔ | ✔ | ✔ |
+| xAXPY    | ✔ | ✔ | ✔ | ✔ |
+| xDOT     | ✔ | ✔ | - | - |
+| xDOTU    | - | - | ✔ | ✔ |
+| xDOTC    | - | - | ✔ | ✔ |
+| xNRM2    | ✔ | ✔ | ✔ | ✔ |
+| xASUM    | ✔ | ✔ | ✔ | ✔ |
+| IxAMAX   | ✔ | ✔ | ✔ | ✔ |
+
+| Level-2  | S | D | C | Z |
+| ---------|---|---|---|---|
+| xGEMV    | ✔ | ✔ | ✔ | ✔ |
+| xGBMV    | ✔ | ✔ | ✔ | ✔ |
+| xHEMV    | - | - | ✔ | ✔ |
+| xHBMV    | - | - | ✔ | ✔ |
+| xHPMV    | - | - | ✔ | ✔ |
+| xSYMV    | ✔ | ✔ | - | - |
+| xSBMV    | ✔ | ✔ | - | - |
+| xSPMV    | ✔ | ✔ | - | - |
+| xTRMV    | ✔ | ✔ | ✔ | ✔ |
+| xTBMV    | ✔ | ✔ | ✔ | ✔ |
+| xTPMV    | ✔ | ✔ | ✔ | ✔ |
+| xGER     | ✔ | ✔ | - | - |
+| xGERU    | - | - | ✔ | ✔ |
+| xGERC    | - | - | ✔ | ✔ |
+| xHER     | - | - | ✔ | ✔ |
+| xHPR     | - | - | ✔ | ✔ |
+| xHER2    | - | - | ✔ | ✔ |
+| xHPR2    | - | - | ✔ | ✔ |
+| xSYR     | ✔ | ✔ | - | - |
+| xSPR     | ✔ | ✔ | - | - |
+| xSYR2    | ✔ | ✔ | - | - |
+| xSPR2    | ✔ | ✔ | - | - |
+
+| Level-3  | S | D | C | Z |
+| ---------|---|---|---|---|
+| xGEMM    | ✔ | ✔ | ✔ | ✔ |
+| xSYMM    | ✔ | ✔ | ✔ | ✔ |
+| xHEMM    | - | - | ✔ | ✔ |
+| xSYRK    | ✔ | ✔ | ✔ | ✔ |
+| xHERK    | - | - | ✔ | ✔ |
+| xSYR2K   | ✔ | ✔ | ✔ | ✔ |
+| xHER2K   | - | - | ✔ | ✔ |
+| xTRMM    | ✔ | ✔ | ✔ | ✔ |
+
+In addition, some non-BLAS routines are also supported by CLBlast. They are experimental and should be used with care:
+
+| Additional | S | D | C | Z |
+| -----------|---|---|---|---|
+| xSUM       | ✔ | ✔ | ✔ | ✔ |
+| IxMAX      | ✔ | ✔ | ✔ | ✔ |
+
+Some BLAS routines are not supported yet by CLBlast. They are shown in the following table:
+
+| Unsupported | S | D | C | Z |
+| ------------|---|---|---|---|
+| xROTG       |   |   | - | - |
+| xROTMG      |   |   | - | - |
+| xROT        |   |   | - | - |
+| xROTM       |   |   | - | - |
+| xTRSV       |   |   |   |   |
+| xTBSV       |   |   |   |   |
+| xTPSV       |   |   |   |   |
+| xTRSM       |   |   |   |   |
 
 
 Contributing
diff --git a/include/clblast.h b/include/clblast.h
index f3b74f6e..57fca119 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -188,10 +188,10 @@ StatusCode Asum(const size_t n,
                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Sum of values in a vector: SSUM/DSUM/ScSUM/DzSUM
+// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
 template <typename T>
 StatusCode Sum(const size_t n,
-               cl_mem asum_buffer, const size_t asum_offset,
+               cl_mem sum_buffer, const size_t sum_offset,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                cl_command_queue* queue, cl_event* event = nullptr);
 
@@ -202,7 +202,7 @@ StatusCode Amax(const size_t n,
                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Index of maximum value in a vector: iSMAX/iDMAX/iCMAX/iZMAX
+// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
 template <typename T>
 StatusCode Max(const size_t n,
                cl_mem imax_buffer, const size_t imax_offset,
diff --git a/include/clblast_c.h b/include/clblast_c.h
index 2f692b66..e23f0305 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -296,21 +296,21 @@ StatusCode PUBLIC_API CLBlastDzasum(const size_t n,
                                    const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_command_queue* queue, cl_event* event);
 
-// Sum of values in a vector: SSUM/DSUM/ScSUM/DzSUM
+// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
 StatusCode PUBLIC_API CLBlastSsum(const size_t n,
-                                  cl_mem asum_buffer, const size_t asum_offset,
+                                  cl_mem sum_buffer, const size_t sum_offset,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_command_queue* queue, cl_event* event);
 StatusCode PUBLIC_API CLBlastDsum(const size_t n,
-                                  cl_mem asum_buffer, const size_t asum_offset,
+                                  cl_mem sum_buffer, const size_t sum_offset,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_command_queue* queue, cl_event* event);
 StatusCode PUBLIC_API CLBlastScsum(const size_t n,
-                                  cl_mem asum_buffer, const size_t asum_offset,
+                                  cl_mem sum_buffer, const size_t sum_offset,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_command_queue* queue, cl_event* event);
 StatusCode PUBLIC_API CLBlastDzsum(const size_t n,
-                                  cl_mem asum_buffer, const size_t asum_offset,
+                                  cl_mem sum_buffer, const size_t sum_offset,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_command_queue* queue, cl_event* event);
 
@@ -332,7 +332,7 @@ StatusCode PUBLIC_API CLBlastiZamax(const size_t n,
                                    const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_command_queue* queue, cl_event* event);
 
-// Index of maximum value in a vector: iSMAX/iDMAX/iCMAX/iZMAX
+// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
 StatusCode PUBLIC_API CLBlastiSmax(const size_t n,
                                   cl_mem imax_buffer, const size_t imax_offset,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
diff --git a/include/internal/routines/level1/xmax.h b/include/internal/routines/level1/xmax.h
new file mode 100644
index 00000000..860a043b
--- /dev/null
+++ b/include/internal/routines/level1/xmax.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xmax routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XMAX_H_
+#define CLBLAST_ROUTINES_XMAX_H_
+
+#include "internal/routine.h"
+#include "internal/routines/level1/xamax.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xmax: public Xamax<T> {
+ public:
+
+  // Members and methods from the base class
+  using Xamax<T>::DoAmax;
+
+  // Constructor
+  Xmax(Queue &queue, EventPointer event, const std::string &name = "MAX"):
+    Xamax<T>(queue, event, name) {
+  }
+
+  // Forwards to the regular absolute version. The implementation difference is realised in the
+  // kernel through a pre-processor macro based on the name of the routine.
+  StatusCode DoMax(const size_t n,
+                   const Buffer<T> &imax_buffer, const size_t imax_offset,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+    return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XMAX_H_
+#endif
diff --git a/include/internal/routines/level1/xsum.h b/include/internal/routines/level1/xsum.h
new file mode 100644
index 00000000..2f633b52
--- /dev/null
+++ b/include/internal/routines/level1/xsum.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsum routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSUM_H_
+#define CLBLAST_ROUTINES_XSUM_H_
+
+#include "internal/routine.h"
+#include "internal/routines/level1/xasum.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsum: public Xasum<T> {
+ public:
+
+  // Members and methods from the base class
+  using Xasum<T>::DoAsum;
+
+  // Constructor
+  Xsum(Queue &queue, EventPointer event, const std::string &name = "SUM"):
+    Xasum<T>(queue, event, name) {
+  }
+
+  // Forwards to the regular absolute version. The implementation difference is realised in the
+  // kernel through a pre-processor macro based on the name of the routine.
+  StatusCode DoSum(const size_t n,
+                   const Buffer<T> &sum_buffer, const size_t sum_offset,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+    return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSUM_H_
+#endif
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index cad9a82d..04f3c30e 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -72,9 +72,9 @@ routines = [
   Routine(True,  True,  "1", "dotc",  T,  [C,Z],     ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated"),
   Routine(True,  True,  "1", "nrm2",  T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"),
   Routine(True,  True,  "1", "asum",  T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector"),
-  Routine(False, False, "1", "sum",   T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Sum of values in a vector"),
+  Routine(True,  False, "1", "sum",   T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)"),
   Routine(True,  True,  "1", "amax",  T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector"),
-  Routine(False, False, "1", "max",   T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector"),
+  Routine(True,  False, "1", "max",   T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)"),
 ],
 [ # Level 2: matrix-vector
   Routine(True,  True,  "2a", "gemv",  T,  [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"),
@@ -298,7 +298,7 @@ files = [
   path_clblast+"/test/wrapper_clblas.h",
   path_clblast+"/test/wrapper_cblas.h",
 ]
-header_lines = [84, 68, 93, 22, 29, 38]
+header_lines = [84, 70, 93, 22, 29, 38]
 footer_lines = [13, 8, 15, 9, 6, 6]
 
 # Checks whether the command-line arguments are valid; exists otherwise
diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py
index b46c3716..2fd26e79 100644
--- a/scripts/generator/routine.py
+++ b/scripts/generator/routine.py
@@ -73,7 +73,7 @@ class Routine():
 
 	# List of scalar buffers
 	def ScalarBuffersFirst(self):
-		return ["dot","nrm2","asum","imax"]
+		return ["dot","nrm2","asum","sum","imax"]
 	def ScalarBuffersSecond(self):
 		return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"]
 
diff --git a/src/clblast.cc b/src/clblast.cc
index 4f2e6fb5..fac5a539 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -29,7 +29,9 @@
 #include "internal/routines/level1/xdotc.h"
 #include "internal/routines/level1/xnrm2.h"
 #include "internal/routines/level1/xasum.h"
+#include "internal/routines/level1/xsum.h" // non-BLAS function
 #include "internal/routines/level1/xamax.h"
+#include "internal/routines/level1/xmax.h" // non-BLAS function
 
 // BLAS level-2 includes
 #include "internal/routines/level2/xgemv.h"
@@ -430,13 +432,19 @@ template StatusCode PUBLIC_API Asum<double2>(const size_t,
                                              const cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
 
-// Sum of values in a vector: SSUM/DSUM/ScSUM/DzSUM
+// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
 template <typename T>
-StatusCode Sum(const size_t,
-               cl_mem, const size_t,
-               const cl_mem, const size_t, const size_t,
-               cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Sum(const size_t n,
+               cl_mem sum_buffer, const size_t sum_offset,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+               cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto routine = Xsum<T>(queue_cpp, event);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoSum(n,
+                       Buffer<T>(sum_buffer), sum_offset,
+                       Buffer<T>(x_buffer), x_offset, x_inc);
 }
 template StatusCode PUBLIC_API Sum<float>(const size_t,
                                           cl_mem, const size_t,
@@ -486,13 +494,19 @@ template StatusCode PUBLIC_API Amax<double2>(const size_t,
                                              const cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
 
-// Index of maximum value in a vector: iSMAX/iDMAX/iCMAX/iZMAX
+// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
 template <typename T>
-StatusCode Max(const size_t,
-               cl_mem, const size_t,
-               const cl_mem, const size_t, const size_t,
-               cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Max(const size_t n,
+               cl_mem imax_buffer, const size_t imax_offset,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+               cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto routine = Xmax<T>(queue_cpp, event);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoMax(n,
+                       Buffer<T>(imax_buffer), imax_offset,
+                       Buffer<T>(x_buffer), x_offset, x_inc);
 }
 template StatusCode PUBLIC_API Max<float>(const size_t,
                                           cl_mem, const size_t,
diff --git a/src/clblast_c.cc b/src/clblast_c.cc
index e6270d57..72d93c4b 100644
--- a/src/clblast_c.cc
+++ b/src/clblast_c.cc
@@ -477,41 +477,41 @@ StatusCode CLBlastDzasum(const size_t n,
 
 // SUM
 StatusCode CLBlastSsum(const size_t n,
-                       cl_mem asum_buffer, const size_t asum_offset,
+                       cl_mem sum_buffer, const size_t sum_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
   auto status = clblast::Sum<float>(n,
-                                    asum_buffer, asum_offset,
+                                    sum_buffer, sum_offset,
                                     x_buffer, x_offset, x_inc,
                                     queue, event);
   return static_cast<StatusCode>(status);
 }
 StatusCode CLBlastDsum(const size_t n,
-                       cl_mem asum_buffer, const size_t asum_offset,
+                       cl_mem sum_buffer, const size_t sum_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
   auto status = clblast::Sum<double>(n,
-                                     asum_buffer, asum_offset,
+                                     sum_buffer, sum_offset,
                                      x_buffer, x_offset, x_inc,
                                      queue, event);
   return static_cast<StatusCode>(status);
 }
 StatusCode CLBlastScsum(const size_t n,
-                       cl_mem asum_buffer, const size_t asum_offset,
+                       cl_mem sum_buffer, const size_t sum_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
   auto status = clblast::Sum<float2>(n,
-                                     asum_buffer, asum_offset,
+                                     sum_buffer, sum_offset,
                                      x_buffer, x_offset, x_inc,
                                      queue, event);
   return static_cast<StatusCode>(status);
 }
 StatusCode CLBlastDzsum(const size_t n,
-                       cl_mem asum_buffer, const size_t asum_offset,
+                       cl_mem sum_buffer, const size_t sum_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
   auto status = clblast::Sum<double2>(n,
-                                      asum_buffer, asum_offset,
+                                      sum_buffer, sum_offset,
                                       x_buffer, x_offset, x_inc,
                                       queue, event);
   return static_cast<StatusCode>(status);
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 57d75ee0..d401744d 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -40,6 +40,7 @@ R"(
   typedef float16 real16;
   #define ZERO 0.0f
   #define ONE 1.0f
+  #define SMALLEST -1.0e37f
 
 // Double-precision 
 #elif PRECISION == 64
@@ -50,6 +51,7 @@ R"(
   typedef double16 real16;
   #define ZERO 0.0
   #define ONE 1.0
+  #define SMALLEST -1.0e37
 
 // Complex single-precision
 #elif PRECISION == 3232
@@ -64,6 +66,7 @@ R"(
                            real sC; real sD; real sE; real sF;} real16;
   #define ZERO 0.0f
   #define ONE 1.0f
+  #define SMALLEST -1.0e37f
 
 // Complex Double-precision
 #elif PRECISION == 6464
@@ -78,6 +81,7 @@ R"(
                             real sC; real sD; real sE; real sF;} real16;
   #define ZERO 0.0
   #define ONE 1.0
+  #define SMALLEST -1.0e37
 #endif
 
 // Single-element version of a complex number
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 03dd05e5..58b75ce2 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -41,14 +41,23 @@ __kernel void Xamax(const int n,
   const int num_groups = get_num_groups(0);
 
   // Performs loading and the first steps of the reduction
-  singlereal max = ZERO;
+  #if defined(ROUTINE_MAX) // non-absolute version
+    singlereal max = SMALLEST;
+  #else
+    singlereal max = ZERO;
+  #endif
   unsigned int imax = 0;
   int id = wgid*WGS1 + lid;
   while (id < n) {
+    const int x_index = id*x_inc + x_offset;
     #if PRECISION == 3232 || PRECISION == 6464
-      singlereal x = fabs(xgm[id*x_inc + x_offset].x);
+      singlereal x = xgm[x_index].x;
     #else
-      singlereal x = fabs(xgm[id*x_inc + x_offset]);
+      singlereal x = xgm[x_index];
+    #endif
+    #if defined(ROUTINE_MAX) // non-absolute version
+    #else
+      x = fabs(x);
     #endif
     if (x >= max) {
       max = x;
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 037dc57e..58d0f11b 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -45,7 +45,10 @@ __kernel void Xasum(const int n,
   int id = wgid*WGS1 + lid;
   while (id < n) {
     real x = xgm[id*x_inc + x_offset];
-    AbsoluteValue(x);
+    #if defined(ROUTINE_SUM) // non-absolute version
+    #else
+      AbsoluteValue(x);
+    #endif
     Add(acc, acc, x);
     id += WGS1*num_groups;
   }