From e113ff0852d21ecb898b3b192145b70cad3f338a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 30 Apr 2016 09:49:39 +0200 Subject: Added non-absolute minimum counter-part IxMIN of the BLAS routine IxAMAX --- CHANGELOG | 4 ++- README.md | 1 + include/clblast.h | 7 +++++ include/clblast_c.h | 18 ++++++++++++ include/internal/routines/level1/xmin.h | 49 +++++++++++++++++++++++++++++++++ scripts/generator/generator.py | 5 ++-- scripts/generator/routine.py | 2 +- src/clblast.cc | 33 ++++++++++++++++++++++ src/clblast_c.cc | 42 ++++++++++++++++++++++++++++ src/kernels/level1/xamax.opencl | 7 +++-- 10 files changed, 162 insertions(+), 6 deletions(-) create mode 100644 include/internal/routines/level1/xmin.h diff --git a/CHANGELOG b/CHANGELOG index 6dc1ed49..f68c2483 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -5,13 +5,15 @@ Development version (next release) - Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries - Fixed the use of events within the library - Changed the enum parameters to match the raw values of the cblas standard -- Fixed the cache of previously compiled binaries and added a function to clear it +- Fixed the cache of previously compiled binaries and added a function to fill or clear it +- Added additional sample programs - Added level-1 routines: * SNRM2/DNRM2/ScNRM2/DzNRM2 * SASUM/DASUM/ScASUM/DzASUM * SSUM/DSUM/ScSUM/DzSUM (non-absolute version of the above xASUM BLAS routines) * iSAMAX/iDAMAX/iCAMAX/iZAMAX * iSMAX/iDMAX/iCMAX/iZMAX (non-absolute version of the above ixAMAX BLAS routines) + * iSMIN/iDMIN/iCMIN/iZMIN (non-absolute minimum version of the above ixAMAX BLAS routines) Version 0.6.0 - Added support for MSVC (Visual Studio) 2015 diff --git a/README.md b/README.md index f2a85efc..0f7b7d3c 100644 --- a/README.md +++ b/README.md @@ -226,6 +226,7 @@ In addition, some non-BLAS routines are also supported by CLBlast. 
They are expe | -----------|---|---|---|---| | xSUM | ✔ | ✔ | ✔ | ✔ | | IxMAX | ✔ | ✔ | ✔ | ✔ | +| IxMIN | ✔ | ✔ | ✔ | ✔ | Some BLAS routines are not supported yet by CLBlast. They are shown in the following table: diff --git a/include/clblast.h b/include/clblast.h index 075ca93e..5df0f605 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -209,6 +209,13 @@ StatusCode Max(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN +template +StatusCode Min(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index dd9b0f67..8b2bf73c 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -350,6 +350,24 @@ StatusCode PUBLIC_API CLBlastiZmax(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN +StatusCode PUBLIC_API CLBlastiSmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiDmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiCmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const 
cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiZmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/include/internal/routines/level1/xmin.h b/include/internal/routines/level1/xmin.h new file mode 100644 index 00000000..4c99a5ad --- /dev/null +++ b/include/internal/routines/level1/xmin.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xmin routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XMIN_H_ +#define CLBLAST_ROUTINES_XMIN_H_ + +#include "internal/routine.h" +#include "internal/routines/level1/xamax.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xmin: public Xamax { + public: + + // Members and methods from the base class + using Xamax::DoAmax; + + // Constructor + Xmin(Queue &queue, EventPointer event, const std::string &name = "MIN"): + Xamax(queue, event, name) { + } + + // Forwards to the regular max-absolute version. The implementation difference is realised in the + // kernel through a pre-processor macro based on the name of the routine. + StatusCode DoMin(const size_t n, + const Buffer &imin_buffer, const size_t imin_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XMIN_H_ +#endif diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index a9419f13..0fd05053 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -75,6 +75,7 @@ routines = [ Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)"), Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector"), Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)"), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imin"], [], "2*n", 
"Index of minimum value in a vector (non-BLAS function)"), ], [ # Level 2: matrix-vector Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), @@ -298,8 +299,8 @@ files = [ path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 70, 93, 22, 29, 38] -footer_lines = [17, 70, 19, 14, 6, 6] +header_lines = [84, 71, 93, 22, 29, 38] +footer_lines = [17, 71, 19, 14, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise for f in files: diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index 2fd26e79..47790a55 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -73,7 +73,7 @@ class Routine(): # List of scalar buffers def ScalarBuffersFirst(self): - return ["dot","nrm2","asum","sum","imax"] + return ["dot","nrm2","asum","sum","imax","imin"] def ScalarBuffersSecond(self): return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"] diff --git a/src/clblast.cc b/src/clblast.cc index a5bb6b67..4d7c9986 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -32,6 +32,7 @@ #include "internal/routines/level1/xsum.h" // non-BLAS function #include "internal/routines/level1/xamax.h" #include "internal/routines/level1/xmax.h" // non-BLAS function +#include "internal/routines/level1/xmin.h" // non-BLAS function // BLAS level-2 includes #include "internal/routines/level2/xgemv.h" @@ -525,6 +526,37 @@ template StatusCode PUBLIC_API Max(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN +template +StatusCode Min(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = 
Xmin(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoMin(n, + Buffer(imin_buffer), imin_offset, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= @@ -1880,6 +1912,7 @@ StatusCode FillCache(const cl_device_id device) { Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); + Xmin(queue, nullptr).SetUp(); Xmin(queue, nullptr).SetUp(); Xmin(queue, nullptr).SetUp(); Xmin(queue, nullptr).SetUp(); // Runs all the level 2 set-up functions Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 47ab1798..1fc63de2 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -601,6 +601,48 @@ StatusCode CLBlastiZmax(const size_t n, return 
static_cast(status); } +// MIN +StatusCode CLBlastiSmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiDmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiCmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiZmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 58b75ce2..48d0eb5c 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -41,7 +41,7 @@ __kernel void Xamax(const int n, const int num_groups = get_num_groups(0); // Performs loading and the first steps of the reduction - #if 
defined(ROUTINE_MAX) // non-absolute version + #if defined(ROUTINE_MAX) || defined(ROUTINE_MIN) // non-absolute version singlereal max = SMALLEST; #else singlereal max = ZERO; @@ -55,7 +55,10 @@ __kernel void Xamax(const int n, #else singlereal x = xgm[x_index]; #endif - #if defined(ROUTINE_MAX) // non-absolute version + #if defined(ROUTINE_MAX) // non-absolute maximum version + // nothing special here + #elif defined(ROUTINE_MIN) // non-absolute minimum version + x = -x; #else x = fabs(x); #endif -- cgit v1.2.3