summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt2
-rw-r--r--doc/clblast.md26
-rw-r--r--include/clblast.h5
-rw-r--r--include/clblast_c.h21
-rwxr-xr-xscripts/generator/generator.py2
-rw-r--r--scripts/generator/generator/routine.py15
-rw-r--r--src/clblast.cpp28
-rw-r--r--src/clblast_c.cpp40
-rw-r--r--src/routines/levelx/xaxpybatched.cpp59
-rw-r--r--src/routines/levelx/xaxpybatched.hpp46
-rw-r--r--test/routines/levelx/xaxpybatched.hpp167
11 files changed, 342 insertions, 69 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bf905bc8..ef6156dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -159,7 +159,7 @@ set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
-set(LEVELX_ROUTINES xomatcopy)
+set(LEVELX_ROUTINES xomatcopy xaxpybatched)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
set(PRECISIONS 32 64 3232 6464 16)
diff --git a/doc/clblast.md b/doc/clblast.md
index eda5c07f..c919169a 100644
--- a/doc/clblast.md
+++ b/doc/clblast.md
@@ -2913,8 +2913,8 @@ C++ API:
template <typename T>
StatusCode AxpyBatched(const size_t n,
const T *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
```
@@ -2923,32 +2923,32 @@ C API:
```
CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
const float *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
const double *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
const cl_float2 *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
const cl_double2 *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
const cl_half *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
```
@@ -2958,10 +2958,8 @@ Arguments to AXPYBATCHED:
* `const size_t n`: Integer size argument. This value must be positive.
* `const T *alphas`: Input scalar constants.
* `const cl_mem *x_buffers`: OpenCL buffers to store the input x vectors.
-* `const size_t x_offset`: The offset in elements from the start of the input x vectors.
* `const size_t x_inc`: Stride/increment of the input x vectors. This value must be greater than 0.
* `cl_mem *y_buffers`: OpenCL buffers to store the output y vectors.
-* `const size_t y_offset`: The offset in elements from the start of the output y vectors.
* `const size_t y_inc`: Stride/increment of the output y vectors. This value must be greater than 0.
* `const size_t batch_count`: Number of batches. This value must be positive.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
diff --git a/include/clblast.h b/include/clblast.h
index aeea4e52..f3f73893 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -97,6 +97,7 @@ enum class StatusCode {
kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
+ kInvalidBatchCount = -2049, // The batch count needs to be positive
kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel
kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
@@ -613,8 +614,8 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
template <typename T>
StatusCode AxpyBatched(const size_t n,
const T *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event = nullptr);
diff --git a/include/clblast_c.h b/include/clblast_c.h
index f933ef6c..5c84b5d7 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -96,6 +96,7 @@ typedef enum CLBlastStatusCode_ {
CLBlastInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
+ CLBlastInvalidBatchCount = -2049, // The batch count needs to be positive
CLBlastInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel
CLBlastMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel
CLBlastInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
@@ -1330,32 +1331,32 @@ CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const
// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n,
const float *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDaxpyBatched(const size_t n,
const double *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCaxpyBatched(const size_t n,
const cl_float2 *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZaxpyBatched(const size_t n,
const cl_double2 *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHaxpyBatched(const size_t n,
const cl_half *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 521afe28..8dd5fc0c 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -41,7 +41,7 @@ FILES = [
"/include/clblast_netlib_c.h",
"/src/clblast_netlib_c.cpp",
]
-HEADER_LINES = [121, 76, 125, 23, 29, 41, 65, 32]
+HEADER_LINES = [122, 76, 126, 23, 29, 41, 65, 32]
FOOTER_LINES = [25, 138, 27, 38, 6, 6, 9, 2]
HEADER_LINES_DOC = 0
FOOTER_LINES_DOC = 63
diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py
index 9b07e671..8807fd8e 100644
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@@ -223,7 +223,7 @@ class Routine:
"""Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')"""
if name in self.inputs or name in self.outputs:
a = [name + "_buffer" + self.b_s()]
- b = [name + "_offset"]
+ b = [name + "_offset"] if not self.batched else []
c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
return [", ".join(a + b + c)]
return []
@@ -251,7 +251,7 @@ class Routine:
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
a = [prefix + "cl_mem " + self.b_star() + name + "_buffer" + self.b_s()]
- b = ["const size_t " + name + "_offset"]
+ b = ["const size_t " + name + "_offset"] if not self.batched else []
c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
return [", ".join(a + b + c)]
return []
@@ -295,7 +295,7 @@ class Routine:
a = [name + "_buffers_cpp"]
else:
a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"]
- b = [name + "_offset"]
+ b = [name + "_offset"] if not self.batched else []
c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
return [", ".join(a + b + c)]
return []
@@ -337,7 +337,7 @@ class Routine:
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
a = [prefix + "cl_mem" + self.b_star()]
- b = ["const size_t"]
+ b = ["const size_t"] if not self.batched else []
c = ["const size_t"] if (name not in self.buffers_without_ld_inc()) else []
return [", ".join(a + b + c)]
return []
@@ -350,12 +350,13 @@ class Routine:
math_name = name.upper() + " matrix" + self.b_s() if (name in self.buffers_matrix()) else name + " vector" + self.b_s()
inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment "
a = ["`" + prefix + "cl_mem " + self.b_star() + name + "_buffer" + self.b_s() + "`: OpenCL buffer" + self.b_s() + " to store the " + inout + " " + math_name + "."]
- b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."]
+ b = []
+ if not self.batched:
+ b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."]
+ c = []
if name not in self.buffers_without_ld_inc():
c = ["`const size_t " + name + "_" + self.postfix(name) + "`: " +
inc_ld_description + "of the " + inout + " " + math_name + ". This value must be greater than 0."]
- else:
- c = []
return a + b + c
return []
diff --git a/src/clblast.cpp b/src/clblast.cpp
index 55562419..e9cac664 100644
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@@ -2178,8 +2178,8 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
template <typename T>
StatusCode AxpyBatched(const size_t n,
const T *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
try {
@@ -2195,40 +2195,40 @@ StatusCode AxpyBatched(const size_t n,
}
routine.DoAxpyBatched(n,
alphas_cpp,
- x_buffers_cpp, x_offset, x_inc,
- y_buffers_cpp, y_offset, y_inc,
+ x_buffers_cpp, x_inc,
+ y_buffers_cpp, y_inc,
batch_count);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API AxpyBatched<float>(const size_t,
const float*,
- const cl_mem*, const size_t, const size_t,
- cl_mem*, const size_t, const size_t,
+ const cl_mem*, const size_t,
+ cl_mem*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<double>(const size_t,
const double*,
- const cl_mem*, const size_t, const size_t,
- cl_mem*, const size_t, const size_t,
+ const cl_mem*, const size_t,
+ cl_mem*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t,
const float2*,
- const cl_mem*, const size_t, const size_t,
- cl_mem*, const size_t, const size_t,
+ const cl_mem*, const size_t,
+ cl_mem*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t,
const double2*,
- const cl_mem*, const size_t, const size_t,
- cl_mem*, const size_t, const size_t,
+ const cl_mem*, const size_t,
+ cl_mem*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<half>(const size_t,
const half*,
- const cl_mem*, const size_t, const size_t,
- cl_mem*, const size_t, const size_t,
+ const cl_mem*, const size_t,
+ cl_mem*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp
index 83450e6f..bd8ea51a 100644
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@@ -3450,8 +3450,8 @@ CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTran
// AXPY
CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
const float *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<float>();
@@ -3462,8 +3462,8 @@ CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
- x_buffers, x_offset, x_inc,
- y_buffers, y_offset, y_inc,
+ x_buffers, x_inc,
+ y_buffers, y_inc,
batch_count,
queue, event)
);
@@ -3471,8 +3471,8 @@ CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
}
CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
const double *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<double>();
@@ -3483,8 +3483,8 @@ CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
- x_buffers, x_offset, x_inc,
- y_buffers, y_offset, y_inc,
+ x_buffers, x_inc,
+ y_buffers, y_inc,
batch_count,
queue, event)
);
@@ -3492,8 +3492,8 @@ CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
}
CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
const cl_float2 *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<float2>();
@@ -3504,8 +3504,8 @@ CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
- x_buffers, x_offset, x_inc,
- y_buffers, y_offset, y_inc,
+ x_buffers, x_inc,
+ y_buffers, y_inc,
batch_count,
queue, event)
);
@@ -3513,8 +3513,8 @@ CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
}
CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
const cl_double2 *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<double2>();
@@ -3525,8 +3525,8 @@ CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
- x_buffers, x_offset, x_inc,
- y_buffers, y_offset, y_inc,
+ x_buffers, x_inc,
+ y_buffers, y_inc,
batch_count,
queue, event)
);
@@ -3534,8 +3534,8 @@ CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
}
CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
const cl_half *alphas,
- const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+ const cl_mem *x_buffers, const size_t x_inc,
+ cl_mem *y_buffers, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<half>();
@@ -3546,8 +3546,8 @@ CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
- x_buffers, x_offset, x_inc,
- y_buffers, y_offset, y_inc,
+ x_buffers, x_inc,
+ y_buffers, y_inc,
batch_count,
queue, event)
);
diff --git a/src/routines/levelx/xaxpybatched.cpp b/src/routines/levelx/xaxpybatched.cpp
new file mode 100644
index 00000000..55458f43
--- /dev/null
+++ b/src/routines/levelx/xaxpybatched.cpp
@@ -0,0 +1,59 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the XaxpyBatched class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "routines/levelx/xaxpybatched.hpp"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: hands queue, event and routine name straight to the Xaxpy<T> base class; the body
+// itself is empty
+template <typename T>
+XaxpyBatched<T>::XaxpyBatched(Queue &queue, EventPointer event, const std::string &name)
+    : Xaxpy<T>(queue, event, name) {}
+
+// =================================================================================================
+
+// The main routine: runs one regular AXPY for every (alpha, x, y) triple in the batch. Throws a
+// BLASError with kInvalidBatchCount when 'batch_count' is zero or when 'alphas', 'x_buffers' or
+// 'y_buffers' does not hold exactly 'batch_count' entries.
+template <typename T>
+void XaxpyBatched<T>::DoAxpyBatched(const size_t n, const std::vector<T> &alphas,
+                                    const std::vector<Buffer<T>> &x_buffers, const size_t x_inc,
+                                    const std::vector<Buffer<T>> &y_buffers, const size_t y_inc,
+                                    const size_t batch_count) {
+
+  // Validates the batch description up-front: every failure maps onto the same status code
+  const auto sizes_consistent = (alphas.size() == batch_count) &&
+                                (x_buffers.size() == batch_count) &&
+                                (y_buffers.size() == batch_count);
+  if (batch_count == 0 || !sizes_consistent) {
+    throw BLASError(StatusCode::kInvalidBatchCount);
+  }
+
+  // Naive implementation: one regular Axpy call per batch item, each at offset zero
+  for (size_t id = 0; id < batch_count; ++id) {
+    DoAxpy(n, alphas[id], x_buffers[id], 0, x_inc, y_buffers[id], 0, y_inc);
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class XaxpyBatched<half>;
+template class XaxpyBatched<float>;
+template class XaxpyBatched<double>;
+template class XaxpyBatched<float2>;
+template class XaxpyBatched<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/levelx/xaxpybatched.hpp b/src/routines/levelx/xaxpybatched.hpp
new file mode 100644
index 00000000..7fd14a74
--- /dev/null
+++ b/src/routines/levelx/xaxpybatched.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the XaxpyBatched routine. This is a non-blas batched version of AXPY.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XAXPYBATCHED_H_
+#define CLBLAST_ROUTINES_XAXPYBATCHED_H_
+
+#include <vector>
+
+#include "routines/level1/xaxpy.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Batched AXPY routine, publicly derived from Xaxpy<T> (see also the comment at the top of the file)
+template <typename T>
+class XaxpyBatched: public Xaxpy<T> {
+ public:
+
+ // Brings the inherited Xaxpy<T>::DoAxpy into this class' scope so it can be named directly
+ using Xaxpy<T>::DoAxpy;
+
+ // Constructor: 'name' defaults to "AXPYBATCHED" when the caller does not supply one
+ XaxpyBatched(Queue &queue, EventPointer event, const std::string &name = "AXPYBATCHED");
+
+ // Templated-precision implementation: takes vectors of alphas and of x/y buffers, a batch count
+ void DoAxpyBatched(const size_t n, const std::vector<T> &alphas,
+ const std::vector<Buffer<T>> &x_buffers, const size_t x_inc,
+ const std::vector<Buffer<T>> &y_buffers, const size_t y_inc,
+ const size_t batch_count);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XAXPYBATCHED_H_
+#endif
diff --git a/test/routines/levelx/xaxpybatched.hpp b/test/routines/levelx/xaxpybatched.hpp
new file mode 100644
index 00000000..181b401c
--- /dev/null
+++ b/test/routines/levelx/xaxpybatched.hpp
@@ -0,0 +1,167 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the XaxpyBatched routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_
+#define CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_
+
+#include <vector>
+#include <string>
+
+#include "utilities/utilities.hpp"
+
+#ifdef CLBLAST_REF_CLBLAS
+ #include "test/wrapper_clblas.hpp"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "test/wrapper_cblas.hpp"
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXaxpyBatched {
+ public:
+
+ // Although it is a non-BLAS routine, it can still be tested against level-1 routines in a loop
+ static size_t BLASLevel() { return 1; }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgN,
+ kArgXInc, kArgYInc,
+ kArgAlpha, kArgBatchCount};
+ }
+
+ // Helper to determine a different alpha value per batch
+ static T GetAlpha(const T alpha_base, const size_t batch_id) {
+ return alpha_base + Constant<T>(batch_id);
+ }
+
+ // Describes how to obtain the sizes of the buffers (per item, not for the full batch)
+ static size_t GetSizeX(const Arguments<T> &args) {
+ return args.n * args.x_inc;
+ }
+ static size_t GetSizeY(const Arguments<T> &args) {
+ return args.n * args.y_inc;
+ }
+
+ // Describes how to set the sizes of all the buffers (per item, not for the full batch)
+ static void SetSizes(Arguments<T> &args) {
+ args.x_size = GetSizeX(args);
+ args.y_size = GetSizeY(args);
+ }
+
+ // Describes what the default values of the leading dimensions of the matrices are
+ static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+ // Describes which transpose options are relevant for this routine
+ using Transposes = std::vector<Transpose>;
+ static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+ static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+ // Describes how to prepare the input data
+ static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
+ std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
+ std::vector<T>&, std::vector<T>&) {} // N/A for this routine
+
+ // Describes how to run the CLBlast routine: gathers per-batch alphas and raw buffer handles first
+ static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto alphas = std::vector<T>();
+ auto x_buffers = std::vector<cl_mem>();
+ auto y_buffers = std::vector<cl_mem>();
+ for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { // one entry per batch item
+ alphas.push_back(GetAlpha(args.alpha, batch));
+ x_buffers.push_back(buffers[batch].x_vec());
+ y_buffers.push_back(buffers[batch].y_vec());
+ }
+ auto status = AxpyBatched(args.n, alphas.data(),
+ x_buffers.data(), args.x_inc,
+ y_buffers.data(), args.y_inc,
+ args.batch_count,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } // event is only waited on and released after a successful call
+ return status; // the status of the AxpyBatched call is passed on unchanged
+ }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison): issues one
+  // clBLAS AXPY per batch item, each at offset zero. Returns the status of the first call that does
+  // not report success, or kSuccess when every batch item completed.
+  #ifdef CLBLAST_REF_CLBLAS
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
+      auto queue_plain = queue();
+      for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
+        auto event = cl_event{};
+        auto status = clblasXaxpy(args.n, GetAlpha(args.alpha, batch),
+                                  buffers[batch].x_vec, 0, args.x_inc,
+                                  buffers[batch].y_vec, 0, args.y_inc,
+                                  1, &queue_plain, 0, nullptr, &event);
+
+        // Checks the status first: after a failed call the event may never have been set, so it
+        // must not be waited on
+        if (static_cast<StatusCode>(status) != StatusCode::kSuccess) {
+          return static_cast<StatusCode>(status);
+        }
+
+        // Waits for completion and releases the event so that it is not leaked per batch item
+        clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+      }
+      return StatusCode::kSuccess;
+    }
+  #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison), per batch item
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
+ for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); // zero-initialised host copies
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers[batch].x_vec.Read(queue, args.x_size, x_vec_cpu); // fetch this item's x and y vectors
+ buffers[batch].y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXaxpy(args.n, GetAlpha(args.alpha, batch),
+ x_vec_cpu, 0, args.x_inc,
+ y_vec_cpu, 0, args.y_inc);
+ buffers[batch].y_vec.Write(queue, args.y_size, y_vec_cpu); // write the host y data back to the y buffer
+ }
+ return StatusCode::kSuccess; // no failure path: this function always reports success
+ }
+ #endif
+
+ // Describes how to download the results of the computation (per item, not for the full batch)
+ static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> result(args.y_size, static_cast<T>(0));
+ buffers.y_vec.Read(queue, args.y_size, result);
+ return result;
+ }
+
+ // Describes how to compute the indices of the result buffer (per item, not for the full batch)
+ static size_t ResultID1(const Arguments<T> &args) { return args.n; }
+ static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+ return id1 * args.y_inc;
+ }
+
+ // Describes how to compute performance metrics (per item, not for the full batch)
+ static size_t GetFlops(const Arguments<T> &args) {
+ return 2 * args.n;
+ }
+ static size_t GetBytes(const Arguments<T> &args) {
+ return (3 * args.n) * sizeof(T);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_
+#endif