diff options
-rw-r--r-- | CHANGELOG | 2 | ||||
-rw-r--r-- | README.md | 128 | ||||
-rw-r--r-- | include/clblast.h | 6 | ||||
-rw-r--r-- | include/clblast_c.h | 12 | ||||
-rw-r--r-- | include/internal/routines/level1/xmax.h | 49 | ||||
-rw-r--r-- | include/internal/routines/level1/xsum.h | 49 | ||||
-rw-r--r-- | scripts/generator/generator.py | 6 | ||||
-rw-r--r-- | scripts/generator/routine.py | 2 | ||||
-rw-r--r-- | src/clblast.cc | 38 | ||||
-rw-r--r-- | src/clblast_c.cc | 16 | ||||
-rw-r--r-- | src/kernels/common.opencl | 4 | ||||
-rw-r--r-- | src/kernels/level1/xamax.opencl | 15 | ||||
-rw-r--r-- | src/kernels/level1/xasum.opencl | 5 |
13 files changed, 237 insertions, 95 deletions
@@ -9,7 +9,9 @@ Development version (next release) - Added level-1 routines: * SNRM2/DNRM2/ScNRM2/DzNRM2 * SASUM/DASUM/ScASUM/DzASUM + * SSUM/DSUM/ScSUM/DzSUM (non-absolute version of the above xASUM BLAS routines) * iSAMAX/iDAMAX/iCAMAX/iZAMAX + * iSMAX/iDMAX/iCMAX/iZMAX (non-absolute version of the above ixAMAX BLAS routines) Version 0.6.0 - Added support for MSVC (Visual Studio) 2015 @@ -169,64 +169,76 @@ These graphs can be generated automatically on your own device. First, compile C Supported routines ------------- -CLBlast is in active development but already supports almost all the BLAS routines. The currently supported routines are marked with '✔' in the following tables. Empty boxes represent routines that still need to be implemented in a future release, whereas routines marked with '-' are not part of BLAS at all. - -| Level-1 | S | D | C | Z | Notes | -| ---------|---|---|---|---|---------| -| xROTG | | | - | - | | -| xROTMG | | | - | - | | -| xROT | | | - | - | | -| xROTM | | | - | - | | -| xSWAP | ✔ | ✔ | ✔ | ✔ | | -| xSCAL | ✔ | ✔ | ✔ | ✔ | | -| xCOPY | ✔ | ✔ | ✔ | ✔ | | -| xAXPY | ✔ | ✔ | ✔ | ✔ | | -| xDOT | ✔ | ✔ | - | - | | -| xDOTU | - | - | ✔ | ✔ | | -| xDOTC | - | - | ✔ | ✔ | | -| xNRM2 | ✔ | ✔ | ✔ | ✔ | | -| xASUM | ✔ | ✔ | ✔ | ✔ | | -| IxAMAX | ✔ | ✔ | ✔ | ✔ | | - -| Level-2 | S | D | C | Z | Notes | -| ---------|---|---|---|---|---------| -| xGEMV | ✔ | ✔ | ✔ | ✔ | | -| xGBMV | ✔ | ✔ | ✔ | ✔ | | -| xHEMV | - | - | ✔ | ✔ | | -| xHBMV | - | - | ✔ | ✔ | | -| xHPMV | - | - | ✔ | ✔ | | -| xSYMV | ✔ | ✔ | - | - | | -| xSBMV | ✔ | ✔ | - | - | | -| xSPMV | ✔ | ✔ | - | - | | -| xTRMV | ✔ | ✔ | ✔ | ✔ | | -| xTBMV | ✔ | ✔ | ✔ | ✔ | | -| xTPMV | ✔ | ✔ | ✔ | ✔ | | -| xTRSV | | | | | | -| xTBSV | | | | | | -| xTPSV | | | | | | -| xGER | ✔ | ✔ | - | - | | -| xGERU | - | - | ✔ | ✔ | | -| xGERC | - | - | ✔ | ✔ | | -| xHER | - | - | ✔ | ✔ | | -| xHPR | - | - | ✔ | ✔ | | -| xHER2 | - | - | ✔ | ✔ | | -| xHPR2 | - | - | ✔ | ✔ | | -| xSYR | ✔ | ✔ | - | - | | -| xSPR | ✔ | ✔ | - | - | | -| xSYR2 | ✔ | ✔ | - | - | | -| xSPR2 | ✔ | ✔ | - | - | | - -| Level-3 | S | D | C | Z | Notes | -| ---------|---|---|---|---|---------| -| xGEMM | ✔ | ✔ | ✔ | ✔ | | -| xSYMM | ✔ | ✔ | ✔ | ✔ | | -| xHEMM | - | - | ✔ | ✔ | | -| xSYRK | ✔ | ✔ | ✔ | ✔ | | -| xHERK | - | - | ✔ | ✔ | | -| xSYR2K | ✔ | ✔ | ✔ | ✔ | | -| xHER2K | - | - | ✔ | ✔ | | -| xTRMM | ✔ | ✔ | ✔ | ✔ | | -| xTRSM | | | | | | +CLBlast is in active development but already supports almost all the BLAS routines. The supported routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all. + +| Level-1 | S | D | C | Z | +| ---------|---|---|---|---| +| xSWAP | ✔ | ✔ | ✔ | ✔ | +| xSCAL | ✔ | ✔ | ✔ | ✔ | +| xCOPY | ✔ | ✔ | ✔ | ✔ | +| xAXPY | ✔ | ✔ | ✔ | ✔ | +| xDOT | ✔ | ✔ | - | - | +| xDOTU | - | - | ✔ | ✔ | +| xDOTC | - | - | ✔ | ✔ | +| xNRM2 | ✔ | ✔ | ✔ | ✔ | +| xASUM | ✔ | ✔ | ✔ | ✔ | +| IxAMAX | ✔ | ✔ | ✔ | ✔ | + +| Level-2 | S | D | C | Z | +| ---------|---|---|---|---| +| xGEMV | ✔ | ✔ | ✔ | ✔ | +| xGBMV | ✔ | ✔ | ✔ | ✔ | +| xHEMV | - | - | ✔ | ✔ | +| xHBMV | - | - | ✔ | ✔ | +| xHPMV | - | - | ✔ | ✔ | +| xSYMV | ✔ | ✔ | - | - | +| xSBMV | ✔ | ✔ | - | - | +| xSPMV | ✔ | ✔ | - | - | +| xTRMV | ✔ | ✔ | ✔ | ✔ | +| xTBMV | ✔ | ✔ | ✔ | ✔ | +| xTPMV | ✔ | ✔ | ✔ | ✔ | +| xGER | ✔ | ✔ | - | - | +| xGERU | - | - | ✔ | ✔ | +| xGERC | - | - | ✔ | ✔ | +| xHER | - | - | ✔ | ✔ | +| xHPR | - | - | ✔ | ✔ | +| xHER2 | - | - | ✔ | ✔ | +| xHPR2 | - | - | ✔ | ✔ | +| xSYR | ✔ | ✔ | - | - | +| xSPR | ✔ | ✔ | - | - | +| xSYR2 | ✔ | ✔ | - | - | +| xSPR2 | ✔ | ✔ | - | - | + +| Level-3 | S | D | C | Z | +| ---------|---|---|---|---| +| xGEMM | ✔ | ✔ | ✔ | ✔ | +| xSYMM | ✔ | ✔ | ✔ | ✔ | +| xHEMM | - | - | ✔ | ✔ | +| xSYRK | ✔ | ✔ | ✔ | ✔ | +| xHERK | - | - | ✔ | ✔ | +| xSYR2K | ✔ | ✔ | ✔ | ✔ | +| xHER2K | - | - | ✔ | ✔ | +| xTRMM | ✔ | ✔ | ✔ | ✔ | + +In addition, some non-BLAS routines are also supported by CLBlast. They are experimental and should be used with care: + +| Additional | S | D | C | Z | +| -----------|---|---|---|---| +| xSUM | ✔ | ✔ | ✔ | ✔ | +| IxMAX | ✔ | ✔ | ✔ | ✔ | + +Some BLAS routines are not supported yet by CLBlast. They are shown in the following table: + +| Unsupported | S | D | C | Z | +| ------------|---|---|---|---| +| xROTG | | | - | - | +| xROTMG | | | - | - | +| xROT | | | - | - | +| xROTM | | | - | - | +| xTRSV | | | | | +| xTBSV | | | | | +| xTPSV | | | | | +| xTRSM | | | | | Contributing diff --git a/include/clblast.h b/include/clblast.h index f3b74f6e..57fca119 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -188,10 +188,10 @@ StatusCode Asum(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Sum of values in a vector: SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM template <typename T> StatusCode Sum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); @@ -202,7 +202,7 @@ StatusCode Amax(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Index of maximum value in a vector: iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX template <typename T> StatusCode Max(const size_t n, cl_mem imax_buffer, const size_t imax_offset, diff --git a/include/clblast_c.h b/include/clblast_c.h index 2f692b66..e23f0305 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -296,21 +296,21 @@ StatusCode PUBLIC_API CLBlastDzasum(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); -// Sum of values in a vector: SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM StatusCode PUBLIC_API CLBlastSsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); StatusCode PUBLIC_API CLBlastDsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); StatusCode PUBLIC_API CLBlastScsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); StatusCode PUBLIC_API CLBlastDzsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); @@ -332,7 +332,7 @@ StatusCode PUBLIC_API CLBlastiZamax(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); -// Index of maximum value in a vector: iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX StatusCode PUBLIC_API CLBlastiSmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, diff --git a/include/internal/routines/level1/xmax.h b/include/internal/routines/level1/xmax.h new file mode 100644 index 00000000..860a043b --- /dev/null +++ b/include/internal/routines/level1/xmax.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xmax routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XMAX_H_ +#define CLBLAST_ROUTINES_XMAX_H_ + +#include "internal/routine.h" +#include "internal/routines/level1/xamax.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xmax: public Xamax<T> { + public: + + // Members and methods from the base class + using Xamax<T>::DoAmax; + + // Constructor + Xmax(Queue &queue, EventPointer event, const std::string &name = "MAX"): + Xamax<T>(queue, event, name) { + } + + // Forwards to the regular absolute version. The implementation difference is realised in the + // kernel through a pre-processor macro based on the name of the routine. + StatusCode DoMax(const size_t n, + const Buffer<T> &imax_buffer, const size_t imax_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XMAX_H_ +#endif diff --git a/include/internal/routines/level1/xsum.h b/include/internal/routines/level1/xsum.h new file mode 100644 index 00000000..2f633b52 --- /dev/null +++ b/include/internal/routines/level1/xsum.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xsum routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSUM_H_ +#define CLBLAST_ROUTINES_XSUM_H_ + +#include "internal/routine.h" +#include "internal/routines/level1/xasum.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xsum: public Xasum<T> { + public: + + // Members and methods from the base class + using Xasum<T>::DoAsum; + + // Constructor + Xsum(Queue &queue, EventPointer event, const std::string &name = "SUM"): + Xasum<T>(queue, event, name) { + } + + // Forwards to the regular absolute version. The implementation difference is realised in the + // kernel through a pre-processor macro based on the name of the routine. + StatusCode DoSum(const size_t n, + const Buffer<T> &sum_buffer, const size_t sum_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSUM_H_ +#endif diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index cad9a82d..04f3c30e 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -72,9 +72,9 @@ routines = [ Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated"), Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"), Routine(True, True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector"), - Routine(False, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Sum of values in a vector"), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)"), Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector"), - Routine(False, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector"), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)"), ], [ # Level 2: matrix-vector Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), @@ -298,7 +298,7 @@ files = [ path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 68, 93, 22, 29, 38] +header_lines = [84, 70, 93, 22, 29, 38] footer_lines = [13, 8, 15, 9, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index b46c3716..2fd26e79 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -73,7 +73,7 @@ class Routine(): # List of scalar buffers def ScalarBuffersFirst(self): - return ["dot","nrm2","asum","imax"] + return ["dot","nrm2","asum","sum","imax"] def ScalarBuffersSecond(self): return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"] diff --git a/src/clblast.cc b/src/clblast.cc index 4f2e6fb5..fac5a539 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -29,7 +29,9 @@ #include "internal/routines/level1/xdotc.h" #include "internal/routines/level1/xnrm2.h" #include "internal/routines/level1/xasum.h" +#include "internal/routines/level1/xsum.h" // non-BLAS function #include "internal/routines/level1/xamax.h" +#include "internal/routines/level1/xmax.h" // non-BLAS function // BLAS level-2 includes #include "internal/routines/level2/xgemv.h" @@ -430,13 +432,19 @@ template StatusCode PUBLIC_API Asum<double2>(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Sum of values in a vector: SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM template <typename T> -StatusCode Sum(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Sum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xsum<T>(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSum(n, + Buffer<T>(sum_buffer), sum_offset, + Buffer<T>(x_buffer), x_offset, x_inc); } template StatusCode PUBLIC_API Sum<float>(const size_t, cl_mem, const size_t, @@ -486,13 +494,19 @@ template StatusCode PUBLIC_API Amax<double2>(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Index of maximum value in a vector: iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX template <typename T> -StatusCode Max(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Max(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xmax<T>(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoMax(n, + Buffer<T>(imax_buffer), imax_offset, + Buffer<T>(x_buffer), x_offset, x_inc); } template StatusCode PUBLIC_API Max<float>(const size_t, cl_mem, const size_t, diff --git a/src/clblast_c.cc b/src/clblast_c.cc index e6270d57..72d93c4b 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -477,41 +477,41 @@ StatusCode CLBlastDzasum(const size_t n, // SUM StatusCode CLBlastSsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Sum<float>(n, - asum_buffer, asum_offset, + sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event); return static_cast<StatusCode>(status); } StatusCode CLBlastDsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Sum<double>(n, - asum_buffer, asum_offset, + sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event); return static_cast<StatusCode>(status); } StatusCode CLBlastScsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Sum<float2>(n, - asum_buffer, asum_offset, + sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event); return static_cast<StatusCode>(status); } StatusCode CLBlastDzsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Sum<double2>(n, - asum_buffer, asum_offset, + sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event); return static_cast<StatusCode>(status); diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 57d75ee0..d401744d 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -40,6 +40,7 @@ R"( typedef float16 real16; #define ZERO 0.0f #define ONE 1.0f + #define SMALLEST -1.0e37f // Double-precision #elif PRECISION == 64 @@ -50,6 +51,7 @@ R"( typedef double16 real16; #define ZERO 0.0 #define ONE 1.0 + #define SMALLEST -1.0e37 // Complex single-precision #elif PRECISION == 3232 @@ -64,6 +66,7 @@ R"( real sC; real sD; real sE; real sF;} real16; #define ZERO 0.0f #define ONE 1.0f + #define SMALLEST -1.0e37f // Complex Double-precision #elif PRECISION == 6464 @@ -78,6 +81,7 @@ R"( real sC; real sD; real sE; real sF;} real16; #define ZERO 0.0 #define ONE 1.0 + #define SMALLEST -1.0e37 #endif // Single-element version of a complex number diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 03dd05e5..58b75ce2 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -41,14 +41,23 @@ __kernel void Xamax(const int n, const int num_groups = get_num_groups(0); // Performs loading and the first steps of the reduction - singlereal max = ZERO; + #if defined(ROUTINE_MAX) // non-absolute version + singlereal max = SMALLEST; + #else + singlereal max = ZERO; + #endif unsigned int imax = 0; int id = wgid*WGS1 + lid; while (id < n) { + const int x_index = id*x_inc + x_offset; #if PRECISION == 3232 || PRECISION == 6464 - singlereal x = fabs(xgm[id*x_inc + x_offset].x); + singlereal x = xgm[x_index].x; #else - singlereal x = fabs(xgm[id*x_inc + x_offset]); + singlereal x = xgm[x_index]; + #endif + #if defined(ROUTINE_MAX) // non-absolute version + #else + x = fabs(x); #endif if (x >= max) { max = x; diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl index 037dc57e..58d0f11b 100644 --- a/src/kernels/level1/xasum.opencl +++ b/src/kernels/level1/xasum.opencl @@ -45,7 +45,10 @@ __kernel void Xasum(const int n, int id = wgid*WGS1 + lid; while (id < n) { real x = xgm[id*x_inc + x_offset]; - AbsoluteValue(x); + #if defined(ROUTINE_SUM) // non-absolute version + #else + AbsoluteValue(x); + #endif Add(acc, acc, x); id += WGS1*num_groups; } |