summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-04-30 09:49:39 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2016-04-30 09:49:39 +0200
commite113ff0852d21ecb898b3b192145b70cad3f338a (patch)
treed380ab9a413394f4d3901c131d0c8a02ab80e025
parent2952390f27c07500bd2a24b5e6fdce5e282fc8dd (diff)
Added non-absolute minimum counterpart IxMIN of the BLAS routine IxAMAX
-rw-r--r--CHANGELOG4
-rw-r--r--README.md1
-rw-r--r--include/clblast.h7
-rw-r--r--include/clblast_c.h18
-rw-r--r--include/internal/routines/level1/xmin.h49
-rw-r--r--scripts/generator/generator.py5
-rw-r--r--scripts/generator/routine.py2
-rw-r--r--src/clblast.cc33
-rw-r--r--src/clblast_c.cc42
-rw-r--r--src/kernels/level1/xamax.opencl7
10 files changed, 162 insertions, 6 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 6dc1ed49..f68c2483 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -5,13 +5,15 @@ Development version (next release)
- Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries
- Fixed the use of events within the library
- Changed the enum parameters to match the raw values of the cblas standard
-- Fixed the cache of previously compiled binaries and added a function to clear it
+- Fixed the cache of previously compiled binaries and added a function to fill or clear it
+- Added additional sample programs
- Added level-1 routines:
* SNRM2/DNRM2/ScNRM2/DzNRM2
* SASUM/DASUM/ScASUM/DzASUM
* SSUM/DSUM/ScSUM/DzSUM (non-absolute version of the above xASUM BLAS routines)
* iSAMAX/iDAMAX/iCAMAX/iZAMAX
* iSMAX/iDMAX/iCMAX/iZMAX (non-absolute version of the above ixAMAX BLAS routines)
+ * iSMIN/iDMIN/iCMIN/iZMIN (non-absolute minimum version of the above ixAMAX BLAS routines)
Version 0.6.0
- Added support for MSVC (Visual Studio) 2015
diff --git a/README.md b/README.md
index f2a85efc..0f7b7d3c 100644
--- a/README.md
+++ b/README.md
@@ -226,6 +226,7 @@ In addition, some non-BLAS routines are also supported by CLBlast. They are expe
| -----------|---|---|---|---|
| xSUM | ✔ | ✔ | ✔ | ✔ |
| IxMAX | ✔ | ✔ | ✔ | ✔ |
+| IxMIN | ✔ | ✔ | ✔ | ✔ |
Some BLAS routines are not supported yet by CLBlast. They are shown in the following table:
diff --git a/include/clblast.h b/include/clblast.h
index 075ca93e..5df0f605 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -209,6 +209,13 @@ StatusCode Max(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
+// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
+template <typename T>
+StatusCode Min(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event = nullptr);
+
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
diff --git a/include/clblast_c.h b/include/clblast_c.h
index dd9b0f67..8b2bf73c 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -350,6 +350,24 @@ StatusCode PUBLIC_API CLBlastiZmax(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
+// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
+StatusCode PUBLIC_API CLBlastiSmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastiDmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastiCmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastiZmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event);
+
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
diff --git a/include/internal/routines/level1/xmin.h b/include/internal/routines/level1/xmin.h
new file mode 100644
index 00000000..4c99a5ad
--- /dev/null
+++ b/include/internal/routines/level1/xmin.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xmin routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XMIN_H_
+#define CLBLAST_ROUTINES_XMIN_H_
+
+#include "internal/routine.h"
+#include "internal/routines/level1/xamax.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xmin: public Xamax<T> {
+ public:
+
+ // Members and methods from the base class
+ using Xamax<T>::DoAmax;
+
+ // Constructor
+ Xmin(Queue &queue, EventPointer event, const std::string &name = "MIN"):
+ Xamax<T>(queue, event, name) {
+ }
+
+ // Forwards to the regular max-absolute version. The implementation difference is realised in the
+ // kernel through a pre-processor macro based on the name of the routine.
+ StatusCode DoMin(const size_t n,
+ const Buffer<T> &imin_buffer, const size_t imin_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+ return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XMIN_H_
+#endif
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index a9419f13..0fd05053 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -75,6 +75,7 @@ routines = [
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)"),
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector"),
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)"),
+ Routine(True, False, "1", "min", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)"),
],
[ # Level 2: matrix-vector
Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"),
@@ -298,8 +299,8 @@ files = [
path_clblast+"/test/wrapper_clblas.h",
path_clblast+"/test/wrapper_cblas.h",
]
-header_lines = [84, 70, 93, 22, 29, 38]
-footer_lines = [17, 70, 19, 14, 6, 6]
+header_lines = [84, 71, 93, 22, 29, 38]
+footer_lines = [17, 71, 19, 14, 6, 6]
# Checks whether the command-line arguments are valid; exits otherwise
for f in files:
diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py
index 2fd26e79..47790a55 100644
--- a/scripts/generator/routine.py
+++ b/scripts/generator/routine.py
@@ -73,7 +73,7 @@ class Routine():
# List of scalar buffers
def ScalarBuffersFirst(self):
- return ["dot","nrm2","asum","sum","imax"]
+ return ["dot","nrm2","asum","sum","imax","imin"]
def ScalarBuffersSecond(self):
return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"]
diff --git a/src/clblast.cc b/src/clblast.cc
index a5bb6b67..4d7c9986 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -32,6 +32,7 @@
#include "internal/routines/level1/xsum.h" // non-BLAS function
#include "internal/routines/level1/xamax.h"
#include "internal/routines/level1/xmax.h" // non-BLAS function
+#include "internal/routines/level1/xmin.h" // non-BLAS function
// BLAS level-2 includes
#include "internal/routines/level2/xgemv.h"
@@ -525,6 +526,37 @@ template StatusCode PUBLIC_API Max<double2>(const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
+// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
+template <typename T>
+StatusCode Min(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xmin<T>(queue_cpp, event);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoMin(n,
+ Buffer<T>(imin_buffer), imin_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+}
+template StatusCode PUBLIC_API Min<float>(const size_t,
+ cl_mem, const size_t,
+ const cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Min<double>(const size_t,
+ cl_mem, const size_t,
+ const cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Min<float2>(const size_t,
+ cl_mem, const size_t,
+ const cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Min<double2>(const size_t,
+ cl_mem, const size_t,
+ const cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
@@ -1880,6 +1912,7 @@ StatusCode FillCache(const cl_device_id device) {
Xsum<float>(queue, nullptr).SetUp(); Xsum<double>(queue, nullptr).SetUp(); Xsum<float2>(queue, nullptr).SetUp(); Xsum<double2>(queue, nullptr).SetUp();
Xamax<float>(queue, nullptr).SetUp(); Xamax<double>(queue, nullptr).SetUp(); Xamax<float2>(queue, nullptr).SetUp(); Xamax<double2>(queue, nullptr).SetUp();
Xmax<float>(queue, nullptr).SetUp(); Xmax<double>(queue, nullptr).SetUp(); Xmax<float2>(queue, nullptr).SetUp(); Xmax<double2>(queue, nullptr).SetUp();
+ Xmin<float>(queue, nullptr).SetUp(); Xmin<double>(queue, nullptr).SetUp(); Xmin<float2>(queue, nullptr).SetUp(); Xmin<double2>(queue, nullptr).SetUp();
// Runs all the level 2 set-up functions
Xgemv<float>(queue, nullptr).SetUp(); Xgemv<double>(queue, nullptr).SetUp(); Xgemv<float2>(queue, nullptr).SetUp(); Xgemv<double2>(queue, nullptr).SetUp();
diff --git a/src/clblast_c.cc b/src/clblast_c.cc
index 47ab1798..1fc63de2 100644
--- a/src/clblast_c.cc
+++ b/src/clblast_c.cc
@@ -601,6 +601,48 @@ StatusCode CLBlastiZmax(const size_t n,
return static_cast<StatusCode>(status);
}
+// MIN
+StatusCode CLBlastiSmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Min<float>(n,
+ imin_buffer, imin_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
+StatusCode CLBlastiDmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Min<double>(n,
+ imin_buffer, imin_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
+StatusCode CLBlastiCmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Min<float2>(n,
+ imin_buffer, imin_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
+StatusCode CLBlastiZmin(const size_t n,
+ cl_mem imin_buffer, const size_t imin_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Min<double2>(n,
+ imin_buffer, imin_offset,
+ x_buffer, x_offset, x_inc,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
+
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 58b75ce2..48d0eb5c 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -41,7 +41,7 @@ __kernel void Xamax(const int n,
const int num_groups = get_num_groups(0);
// Performs loading and the first steps of the reduction
- #if defined(ROUTINE_MAX) // non-absolute version
+ #if defined(ROUTINE_MAX) || defined(ROUTINE_MIN) // non-absolute version
singlereal max = SMALLEST;
#else
singlereal max = ZERO;
@@ -55,7 +55,10 @@ __kernel void Xamax(const int n,
#else
singlereal x = xgm[x_index];
#endif
- #if defined(ROUTINE_MAX) // non-absolute version
+ #if defined(ROUTINE_MAX) // non-absolute maximum version
+ // nothing special here
+ #elif defined(ROUTINE_MIN) // non-absolute minimum version
+ x = -x;
#else
x = fabs(x);
#endif