summaryrefslogtreecommitdiff
path: root/src/routines
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-10-22 15:05:12 +0200
committerGitHub <noreply@github.com>2016-10-22 15:05:12 +0200
commit280698d0767219e174b12e51e8e42b228bbf28e9 (patch)
tree25db4d2d360cc161ca7d8e563c847faf08a745a0 /src/routines
parent9b596820d2dd833648706bff505b459c58f45b4b (diff)
parent56f300607b1d0b81ab3269894fda5a066c46cdeb (diff)
Merge pull request #117 from intelfx/exceptions
Convert to use C++ exceptions internally
Diffstat (limited to 'src/routines')
-rw-r--r--src/routines/common.cpp27
-rw-r--r--src/routines/common.hpp158
-rw-r--r--src/routines/level1/xamax.cpp100
-rw-r--r--src/routines/level1/xamax.hpp6
-rw-r--r--src/routines/level1/xasum.cpp94
-rw-r--r--src/routines/level1/xasum.hpp6
-rw-r--r--src/routines/level1/xaxpy.cpp91
-rw-r--r--src/routines/level1/xaxpy.hpp6
-rw-r--r--src/routines/level1/xcopy.cpp87
-rw-r--r--src/routines/level1/xcopy.hpp6
-rw-r--r--src/routines/level1/xdot.cpp109
-rw-r--r--src/routines/level1/xdot.hpp10
-rw-r--r--src/routines/level1/xdotc.cpp16
-rw-r--r--src/routines/level1/xdotc.hpp8
-rw-r--r--src/routines/level1/xdotu.cpp16
-rw-r--r--src/routines/level1/xdotu.hpp8
-rw-r--r--src/routines/level1/xmax.hpp8
-rw-r--r--src/routines/level1/xmin.hpp8
-rw-r--r--src/routines/level1/xnrm2.cpp94
-rw-r--r--src/routines/level1/xnrm2.hpp6
-rw-r--r--src/routines/level1/xscal.cpp78
-rw-r--r--src/routines/level1/xscal.hpp4
-rw-r--r--src/routines/level1/xsum.hpp8
-rw-r--r--src/routines/level1/xswap.cpp87
-rw-r--r--src/routines/level1/xswap.hpp6
-rw-r--r--src/routines/level2/xgbmv.cpp28
-rw-r--r--src/routines/level2/xgbmv.hpp14
-rw-r--r--src/routines/level2/xgemv.cpp127
-rw-r--r--src/routines/level2/xgemv.hpp34
-rw-r--r--src/routines/level2/xger.cpp82
-rw-r--r--src/routines/level2/xger.hpp12
-rw-r--r--src/routines/level2/xgerc.cpp20
-rw-r--r--src/routines/level2/xgerc.hpp12
-rw-r--r--src/routines/level2/xgeru.cpp20
-rw-r--r--src/routines/level2/xgeru.hpp12
-rw-r--r--src/routines/level2/xhbmv.cpp28
-rw-r--r--src/routines/level2/xhbmv.hpp14
-rw-r--r--src/routines/level2/xhemv.cpp28
-rw-r--r--src/routines/level2/xhemv.hpp14
-rw-r--r--src/routines/level2/xher.cpp78
-rw-r--r--src/routines/level2/xher.hpp12
-rw-r--r--src/routines/level2/xher2.cpp87
-rw-r--r--src/routines/level2/xher2.hpp14
-rw-r--r--src/routines/level2/xhpmv.cpp28
-rw-r--r--src/routines/level2/xhpmv.hpp14
-rw-r--r--src/routines/level2/xhpr.cpp18
-rw-r--r--src/routines/level2/xhpr.hpp10
-rw-r--r--src/routines/level2/xhpr2.cpp22
-rw-r--r--src/routines/level2/xhpr2.hpp12
-rw-r--r--src/routines/level2/xsbmv.cpp28
-rw-r--r--src/routines/level2/xsbmv.hpp14
-rw-r--r--src/routines/level2/xspmv.cpp28
-rw-r--r--src/routines/level2/xspmv.hpp14
-rw-r--r--src/routines/level2/xspr.cpp18
-rw-r--r--src/routines/level2/xspr.hpp10
-rw-r--r--src/routines/level2/xspr2.cpp22
-rw-r--r--src/routines/level2/xspr2.hpp12
-rw-r--r--src/routines/level2/xsymv.cpp28
-rw-r--r--src/routines/level2/xsymv.hpp14
-rw-r--r--src/routines/level2/xsyr.cpp16
-rw-r--r--src/routines/level2/xsyr.hpp10
-rw-r--r--src/routines/level2/xsyr2.cpp20
-rw-r--r--src/routines/level2/xsyr2.hpp12
-rw-r--r--src/routines/level2/xtbmv.cpp44
-rw-r--r--src/routines/level2/xtbmv.hpp10
-rw-r--r--src/routines/level2/xtpmv.cpp44
-rw-r--r--src/routines/level2/xtpmv.hpp10
-rw-r--r--src/routines/level2/xtrmv.cpp44
-rw-r--r--src/routines/level2/xtrmv.hpp10
-rw-r--r--src/routines/level3/xgemm.cpp308
-rw-r--r--src/routines/level3/xgemm.hpp48
-rw-r--r--src/routines/level3/xhemm.cpp132
-rw-r--r--src/routines/level3/xhemm.hpp14
-rw-r--r--src/routines/level3/xher2k.cpp291
-rw-r--r--src/routines/level3/xher2k.hpp14
-rw-r--r--src/routines/level3/xherk.cpp201
-rw-r--r--src/routines/level3/xherk.hpp12
-rw-r--r--src/routines/level3/xsymm.cpp132
-rw-r--r--src/routines/level3/xsymm.hpp14
-rw-r--r--src/routines/level3/xsyr2k.cpp219
-rw-r--r--src/routines/level3/xsyr2k.hpp14
-rw-r--r--src/routines/level3/xsyrk.cpp169
-rw-r--r--src/routines/level3/xsyrk.hpp12
-rw-r--r--src/routines/level3/xtrmm.cpp134
-rw-r--r--src/routines/level3/xtrmm.hpp12
-rw-r--r--src/routines/levelx/xomatcopy.cpp32
-rw-r--r--src/routines/levelx/xomatcopy.hpp8
87 files changed, 1858 insertions, 2103 deletions
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
index 3969cf9f..c995dc12 100644
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@@ -20,22 +20,26 @@ namespace clblast {
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
-StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
- std::vector<size_t> global, const std::vector<size_t> &local,
- EventPointer event, const std::vector<Event> &waitForEvents) {
+void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+ std::vector<size_t> global, const std::vector<size_t> &local,
+ EventPointer event, const std::vector<Event> &waitForEvents) {
if (!local.empty()) {
// Tests for validity of the local thread sizes
if (local.size() > device.MaxWorkItemDimensions()) {
- return StatusCode::kInvalidLocalNumDimensions;
+ throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
}
const auto max_work_item_sizes = device.MaxWorkItemSizes();
for (auto i=size_t{0}; i<local.size(); ++i) {
- if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
+ if (local[i] > max_work_item_sizes[i]) {
+ throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
+ }
}
auto local_size = size_t{1};
for (auto &item: local) { local_size *= item; }
- if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
+ if (local_size > device.MaxWorkGroupSize()) {
+ throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
+ }
// Make sure the global thread sizes are at least equal to the local sizes
for (auto i=size_t{0}; i<global.size(); ++i) {
@@ -45,7 +49,9 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Tests for local memory usage
const auto local_mem_usage = kernel.LocalMemUsage(device);
- if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
+ if (!device.IsLocalMemoryValid(local_mem_usage)) {
+ throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
+ }
// Prints the name of the kernel to launch in case of debugging in verbose mode
#ifdef VERBOSE
@@ -55,9 +61,7 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
#endif
// Launches the kernel (and checks for launch errors)
- try {
- kernel.Launch(queue, global, local, event, waitForEvents);
- } catch (...) { return StatusCode::kKernelLaunchError; }
+ kernel.Launch(queue, global, local, event, waitForEvents);
// Prints the elapsed execution time in case of debugging in verbose mode
#ifdef VERBOSE
@@ -66,9 +70,6 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed kernel in %.2lf ms\n", timing);
#endif
-
- // No errors, normal termination of this function
- return StatusCode::kSuccess;
}
// =================================================================================================
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 9d8849c3..802abec4 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -27,29 +27,29 @@ namespace clblast {
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
-StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
- std::vector<size_t> global, const std::vector<size_t> &local,
- EventPointer event, const std::vector<Event> &waitForEvents = {});
+void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+ std::vector<size_t> global, const std::vector<size_t> &local,
+ EventPointer event, const std::vector<Event> &waitForEvents = {});
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
// to write to symmetric and triangular matrices through optional arguments.
template <typename T>
-StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
- const Database &db,
- EventPointer event, const std::vector<Event> &waitForEvents,
- const size_t src_one, const size_t src_two,
- const size_t src_ld, const size_t src_offset,
- const Buffer<T> &src,
- const size_t dest_one, const size_t dest_two,
- const size_t dest_ld, const size_t dest_offset,
- const Buffer<T> &dest,
- const T alpha,
- const Program &program, const bool do_pad,
- const bool do_transpose, const bool do_conjugate,
- const bool upper = false, const bool lower = false,
- const bool diagonal_imag_zero = false) {
+void PadCopyTransposeMatrix(Queue &queue, const Device &device,
+ const Database &db,
+ EventPointer event, const std::vector<Event> &waitForEvents,
+ const size_t src_one, const size_t src_two,
+ const size_t src_ld, const size_t src_offset,
+ const Buffer<T> &src,
+ const size_t dest_one, const size_t dest_two,
+ const size_t dest_ld, const size_t dest_offset,
+ const Buffer<T> &dest,
+ const T alpha,
+ const Program &program, const bool do_pad,
+ const bool do_transpose, const bool do_conjugate,
+ const bool upper = false, const bool lower = false,
+ const bool diagonal_imag_zero = false) {
// Determines whether or not the fast-version could potentially be used
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
@@ -84,77 +84,75 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
// Retrieves the kernel from the compiled binary
- try {
- auto kernel = Kernel(program, kernel_name);
+ auto kernel = Kernel(program, kernel_name);
- // Sets the kernel arguments
- if (use_fast_kernel) {
- kernel.SetArgument(0, static_cast<int>(src_ld));
- kernel.SetArgument(1, src());
- kernel.SetArgument(2, dest());
- kernel.SetArgument(3, GetRealArg(alpha));
+ // Sets the kernel arguments
+ if (use_fast_kernel) {
+ kernel.SetArgument(0, static_cast<int>(src_ld));
+ kernel.SetArgument(1, src());
+ kernel.SetArgument(2, dest());
+ kernel.SetArgument(3, GetRealArg(alpha));
+ }
+ else {
+ kernel.SetArgument(0, static_cast<int>(src_one));
+ kernel.SetArgument(1, static_cast<int>(src_two));
+ kernel.SetArgument(2, static_cast<int>(src_ld));
+ kernel.SetArgument(3, static_cast<int>(src_offset));
+ kernel.SetArgument(4, src());
+ kernel.SetArgument(5, static_cast<int>(dest_one));
+ kernel.SetArgument(6, static_cast<int>(dest_two));
+ kernel.SetArgument(7, static_cast<int>(dest_ld));
+ kernel.SetArgument(8, static_cast<int>(dest_offset));
+ kernel.SetArgument(9, dest());
+ kernel.SetArgument(10, GetRealArg(alpha));
+ if (do_pad) {
+ kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
- kernel.SetArgument(0, static_cast<int>(src_one));
- kernel.SetArgument(1, static_cast<int>(src_two));
- kernel.SetArgument(2, static_cast<int>(src_ld));
- kernel.SetArgument(3, static_cast<int>(src_offset));
- kernel.SetArgument(4, src());
- kernel.SetArgument(5, static_cast<int>(dest_one));
- kernel.SetArgument(6, static_cast<int>(dest_two));
- kernel.SetArgument(7, static_cast<int>(dest_ld));
- kernel.SetArgument(8, static_cast<int>(dest_offset));
- kernel.SetArgument(9, dest());
- kernel.SetArgument(10, GetRealArg(alpha));
- if (do_pad) {
- kernel.SetArgument(11, static_cast<int>(do_conjugate));
- }
- else {
- kernel.SetArgument(11, static_cast<int>(upper));
- kernel.SetArgument(12, static_cast<int>(lower));
- kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
- }
+ kernel.SetArgument(11, static_cast<int>(upper));
+ kernel.SetArgument(12, static_cast<int>(lower));
+ kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
}
+ }
- // Launches the kernel and returns the error code. Uses global and local thread sizes based on
- // parameters in the database.
- if (do_transpose) {
- if (use_fast_kernel) {
- const auto global = std::vector<size_t>{
- dest_one / db["TRA_WPT"],
- dest_two / db["TRA_WPT"]
- };
- const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
- return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
- }
- else {
- const auto global = std::vector<size_t>{
- Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
- Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
- };
- const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
- return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
- }
+ // Launches the kernel and returns the error code. Uses global and local thread sizes based on
+ // parameters in the database.
+ if (do_transpose) {
+ if (use_fast_kernel) {
+ const auto global = std::vector<size_t>{
+ dest_one / db["TRA_WPT"],
+ dest_two / db["TRA_WPT"]
+ };
+ const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
+ RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
- if (use_fast_kernel) {
- const auto global = std::vector<size_t>{
- dest_one / db["COPY_VW"],
- dest_two / db["COPY_WPT"]
- };
- const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
- return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
- }
- else {
- const auto global = std::vector<size_t>{
- Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
- Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
- };
- const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
- return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
- }
+ const auto global = std::vector<size_t>{
+ Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
+ Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
+ };
+ const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
+ RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
- } catch (...) { return StatusCode::kInvalidKernel; }
+ }
+ else {
+ if (use_fast_kernel) {
+ const auto global = std::vector<size_t>{
+ dest_one / db["COPY_VW"],
+ dest_two / db["COPY_WPT"]
+ };
+ const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
+ RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+ }
+ else {
+ const auto global = std::vector<size_t>{
+ Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
+ Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
+ };
+ const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
+ RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+ }
+ }
}
// =================================================================================================
diff --git a/src/routines/level1/xamax.cpp b/src/routines/level1/xamax.cpp
index 6b6e7f9e..e9efa1a7 100644
--- a/src/routines/level1/xamax.cpp
+++ b/src/routines/level1/xamax.cpp
@@ -22,74 +22,64 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xamax.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xamax<T>::DoAmax(const size_t n,
- const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xamax<T>::DoAmax(const size_t n,
+ const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorIndex(1, imax_buffer, imax_offset);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorIndex(1, imax_buffer, imax_offset);
// Retrieves the Xamax kernels from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel1 = Kernel(program, "Xamax");
- auto kernel2 = Kernel(program, "XamaxEpilogue");
-
- // Creates the buffer for intermediate values
- auto temp_size = 2*db_["WGS2"];
- auto temp_buffer1 = Buffer<T>(context_, temp_size);
- auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
-
- // Sets the kernel arguments
- kernel1.SetArgument(0, static_cast<int>(n));
- kernel1.SetArgument(1, x_buffer());
- kernel1.SetArgument(2, static_cast<int>(x_offset));
- kernel1.SetArgument(3, static_cast<int>(x_inc));
- kernel1.SetArgument(4, temp_buffer1());
- kernel1.SetArgument(5, temp_buffer2());
-
- // Event waiting list
- auto eventWaitList = std::vector<Event>();
-
- // Launches the main kernel
- auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
- auto local1 = std::vector<size_t>{db_["WGS1"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(kernelEvent);
-
- // Sets the arguments for the epilogue kernel
- kernel2.SetArgument(0, temp_buffer1());
- kernel2.SetArgument(1, temp_buffer2());
- kernel2.SetArgument(2, imax_buffer());
- kernel2.SetArgument(3, static_cast<int>(imax_offset));
-
- // Launches the epilogue kernel
- auto global2 = std::vector<size_t>{db_["WGS2"]};
- auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel1 = Kernel(program, "Xamax");
+ auto kernel2 = Kernel(program, "XamaxEpilogue");
+
+ // Creates the buffer for intermediate values
+ auto temp_size = 2*db_["WGS2"];
+ auto temp_buffer1 = Buffer<T>(context_, temp_size);
+ auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
+
+ // Sets the kernel arguments
+ kernel1.SetArgument(0, static_cast<int>(n));
+ kernel1.SetArgument(1, x_buffer());
+ kernel1.SetArgument(2, static_cast<int>(x_offset));
+ kernel1.SetArgument(3, static_cast<int>(x_inc));
+ kernel1.SetArgument(4, temp_buffer1());
+ kernel1.SetArgument(5, temp_buffer2());
+
+ // Event waiting list
+ auto eventWaitList = std::vector<Event>();
+
+ // Launches the main kernel
+ auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+ auto local1 = std::vector<size_t>{db_["WGS1"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+ eventWaitList.push_back(kernelEvent);
+
+ // Sets the arguments for the epilogue kernel
+ kernel2.SetArgument(0, temp_buffer1());
+ kernel2.SetArgument(1, temp_buffer2());
+ kernel2.SetArgument(2, imax_buffer());
+ kernel2.SetArgument(3, static_cast<int>(imax_offset));
+
+ // Launches the epilogue kernel
+ auto global2 = std::vector<size_t>{db_["WGS2"]};
+ auto local2 = std::vector<size_t>{db_["WGS2"]};
+ RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================
diff --git a/src/routines/level1/xamax.hpp b/src/routines/level1/xamax.hpp
index aa45a8e4..4d1e0082 100644
--- a/src/routines/level1/xamax.hpp
+++ b/src/routines/level1/xamax.hpp
@@ -28,9 +28,9 @@ class Xamax: public Routine {
Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
// Templated-precision implementation of the routine
- StatusCode DoAmax(const size_t n,
- const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoAmax(const size_t n,
+ const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xasum.cpp b/src/routines/level1/xasum.cpp
index 0c1ce903..a242a5fa 100644
--- a/src/routines/level1/xasum.cpp
+++ b/src/routines/level1/xasum.cpp
@@ -22,71 +22,61 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xasum.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xasum<T>::DoAsum(const size_t n,
- const Buffer<T> &asum_buffer, const size_t asum_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xasum<T>::DoAsum(const size_t n,
+ const Buffer<T> &asum_buffer, const size_t asum_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorScalar(1, asum_buffer, asum_offset);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorScalar(1, asum_buffer, asum_offset);
// Retrieves the Xasum kernels from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel1 = Kernel(program, "Xasum");
- auto kernel2 = Kernel(program, "XasumEpilogue");
-
- // Creates the buffer for intermediate values
- auto temp_size = 2*db_["WGS2"];
- auto temp_buffer = Buffer<T>(context_, temp_size);
-
- // Sets the kernel arguments
- kernel1.SetArgument(0, static_cast<int>(n));
- kernel1.SetArgument(1, x_buffer());
- kernel1.SetArgument(2, static_cast<int>(x_offset));
- kernel1.SetArgument(3, static_cast<int>(x_inc));
- kernel1.SetArgument(4, temp_buffer());
-
- // Event waiting list
- auto eventWaitList = std::vector<Event>();
-
- // Launches the main kernel
- auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
- auto local1 = std::vector<size_t>{db_["WGS1"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(kernelEvent);
-
- // Sets the arguments for the epilogue kernel
- kernel2.SetArgument(0, temp_buffer());
- kernel2.SetArgument(1, asum_buffer());
- kernel2.SetArgument(2, static_cast<int>(asum_offset));
-
- // Launches the epilogue kernel
- auto global2 = std::vector<size_t>{db_["WGS2"]};
- auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel1 = Kernel(program, "Xasum");
+ auto kernel2 = Kernel(program, "XasumEpilogue");
+
+ // Creates the buffer for intermediate values
+ auto temp_size = 2*db_["WGS2"];
+ auto temp_buffer = Buffer<T>(context_, temp_size);
+
+ // Sets the kernel arguments
+ kernel1.SetArgument(0, static_cast<int>(n));
+ kernel1.SetArgument(1, x_buffer());
+ kernel1.SetArgument(2, static_cast<int>(x_offset));
+ kernel1.SetArgument(3, static_cast<int>(x_inc));
+ kernel1.SetArgument(4, temp_buffer());
+
+ // Event waiting list
+ auto eventWaitList = std::vector<Event>();
+
+ // Launches the main kernel
+ auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+ auto local1 = std::vector<size_t>{db_["WGS1"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+ eventWaitList.push_back(kernelEvent);
+
+ // Sets the arguments for the epilogue kernel
+ kernel2.SetArgument(0, temp_buffer());
+ kernel2.SetArgument(1, asum_buffer());
+ kernel2.SetArgument(2, static_cast<int>(asum_offset));
+
+ // Launches the epilogue kernel
+ auto global2 = std::vector<size_t>{db_["WGS2"]};
+ auto local2 = std::vector<size_t>{db_["WGS2"]};
+ RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================
diff --git a/src/routines/level1/xasum.hpp b/src/routines/level1/xasum.hpp
index 5a253f4d..0afcc4ff 100644
--- a/src/routines/level1/xasum.hpp
+++ b/src/routines/level1/xasum.hpp
@@ -28,9 +28,9 @@ class Xasum: public Routine {
Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
// Templated-precision implementation of the routine
- StatusCode DoAsum(const size_t n,
- const Buffer<T> &asum_buffer, const size_t asum_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoAsum(const size_t n,
+ const Buffer<T> &asum_buffer, const size_t asum_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp
index 3445e2b5..5436c5b7 100644
--- a/src/routines/level1/xaxpy.cpp
+++ b/src/routines/level1/xaxpy.cpp
@@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xaxpy.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(n, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@@ -55,45 +52,39 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
// Retrieves the Xaxpy kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- if (use_fast_kernel) {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, GetRealArg(alpha));
- kernel.SetArgument(2, x_buffer());
- kernel.SetArgument(3, y_buffer());
- }
- else {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, GetRealArg(alpha));
- kernel.SetArgument(2, x_buffer());
- kernel.SetArgument(3, static_cast<int>(x_offset));
- kernel.SetArgument(4, static_cast<int>(x_inc));
- kernel.SetArgument(5, y_buffer());
- kernel.SetArgument(6, static_cast<int>(y_offset));
- kernel.SetArgument(7, static_cast<int>(y_inc));
- }
-
- // Launches the kernel
- if (use_fast_kernel) {
- auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- else {
- auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
- auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ if (use_fast_kernel) {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, GetRealArg(alpha));
+ kernel.SetArgument(2, x_buffer());
+ kernel.SetArgument(3, y_buffer());
+ }
+ else {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, GetRealArg(alpha));
+ kernel.SetArgument(2, x_buffer());
+ kernel.SetArgument(3, static_cast<int>(x_offset));
+ kernel.SetArgument(4, static_cast<int>(x_inc));
+ kernel.SetArgument(5, y_buffer());
+ kernel.SetArgument(6, static_cast<int>(y_offset));
+ kernel.SetArgument(7, static_cast<int>(y_inc));
+ }
+
+ // Launches the kernel
+ if (use_fast_kernel) {
+ auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
+ else {
+ auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+ auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
}
// =================================================================================================
diff --git a/src/routines/level1/xaxpy.hpp b/src/routines/level1/xaxpy.hpp
index caac871e..9b30dfaa 100644
--- a/src/routines/level1/xaxpy.hpp
+++ b/src/routines/level1/xaxpy.hpp
@@ -28,9 +28,9 @@ class Xaxpy: public Routine {
Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
// Templated-precision implementation of the routine
- StatusCode DoAxpy(const size_t n, const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoAxpy(const size_t n, const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xcopy.cpp b/src/routines/level1/xcopy.cpp
index 673ef349..d86200c0 100644
--- a/src/routines/level1/xcopy.cpp
+++ b/src/routines/level1/xcopy.cpp
@@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xcopy.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xcopy<T>::DoCopy(const size_t n,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xcopy<T>::DoCopy(const size_t n,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(n, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@@ -55,43 +52,37 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy";
// Retrieves the Xcopy kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- if (use_fast_kernel) {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, x_buffer());
- kernel.SetArgument(2, y_buffer());
- }
- else {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, x_buffer());
- kernel.SetArgument(2, static_cast<int>(x_offset));
- kernel.SetArgument(3, static_cast<int>(x_inc));
- kernel.SetArgument(4, y_buffer());
- kernel.SetArgument(5, static_cast<int>(y_offset));
- kernel.SetArgument(6, static_cast<int>(y_inc));
- }
-
- // Launches the kernel
- if (use_fast_kernel) {
- auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- else {
- auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
- auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ if (use_fast_kernel) {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, x_buffer());
+ kernel.SetArgument(2, y_buffer());
+ }
+ else {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, x_buffer());
+ kernel.SetArgument(2, static_cast<int>(x_offset));
+ kernel.SetArgument(3, static_cast<int>(x_inc));
+ kernel.SetArgument(4, y_buffer());
+ kernel.SetArgument(5, static_cast<int>(y_offset));
+ kernel.SetArgument(6, static_cast<int>(y_inc));
+ }
+
+ // Launches the kernel
+ if (use_fast_kernel) {
+ auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
+ else {
+ auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+ auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
}
// =================================================================================================
diff --git a/src/routines/level1/xcopy.hpp b/src/routines/level1/xcopy.hpp
index 0c424ba3..a6454fcc 100644
--- a/src/routines/level1/xcopy.hpp
+++ b/src/routines/level1/xcopy.hpp
@@ -28,9 +28,9 @@ class Xcopy: public Routine {
Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
// Templated-precision implementation of the routine
- StatusCode DoCopy(const size_t n,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoCopy(const size_t n,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xdot.cpp b/src/routines/level1/xdot.cpp
index bafea157..9d718913 100644
--- a/src/routines/level1/xdot.cpp
+++ b/src/routines/level1/xdot.cpp
@@ -22,79 +22,68 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xdot.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xdot<T>::DoDot(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const bool do_conjugate) {
+void Xdot<T>::DoDot(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const bool do_conjugate) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(n, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorScalar(1, dot_buffer, dot_offset);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorY(n, y_buffer, y_offset, y_inc);
+ TestVectorScalar(1, dot_buffer, dot_offset);
// Retrieves the Xdot kernels from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel1 = Kernel(program, "Xdot");
- auto kernel2 = Kernel(program, "XdotEpilogue");
-
- // Creates the buffer for intermediate values
- auto temp_size = 2*db_["WGS2"];
- auto temp_buffer = Buffer<T>(context_, temp_size);
-
- // Sets the kernel arguments
- kernel1.SetArgument(0, static_cast<int>(n));
- kernel1.SetArgument(1, x_buffer());
- kernel1.SetArgument(2, static_cast<int>(x_offset));
- kernel1.SetArgument(3, static_cast<int>(x_inc));
- kernel1.SetArgument(4, y_buffer());
- kernel1.SetArgument(5, static_cast<int>(y_offset));
- kernel1.SetArgument(6, static_cast<int>(y_inc));
- kernel1.SetArgument(7, temp_buffer());
- kernel1.SetArgument(8, static_cast<int>(do_conjugate));
-
- // Event waiting list
- auto eventWaitList = std::vector<Event>();
-
- // Launches the main kernel
- auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
- auto local1 = std::vector<size_t>{db_["WGS1"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(kernelEvent);
-
- // Sets the arguments for the epilogue kernel
- kernel2.SetArgument(0, temp_buffer());
- kernel2.SetArgument(1, dot_buffer());
- kernel2.SetArgument(2, static_cast<int>(dot_offset));
-
- // Launches the epilogue kernel
- auto global2 = std::vector<size_t>{db_["WGS2"]};
- auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel1 = Kernel(program, "Xdot");
+ auto kernel2 = Kernel(program, "XdotEpilogue");
+
+ // Creates the buffer for intermediate values
+ auto temp_size = 2*db_["WGS2"];
+ auto temp_buffer = Buffer<T>(context_, temp_size);
+
+ // Sets the kernel arguments
+ kernel1.SetArgument(0, static_cast<int>(n));
+ kernel1.SetArgument(1, x_buffer());
+ kernel1.SetArgument(2, static_cast<int>(x_offset));
+ kernel1.SetArgument(3, static_cast<int>(x_inc));
+ kernel1.SetArgument(4, y_buffer());
+ kernel1.SetArgument(5, static_cast<int>(y_offset));
+ kernel1.SetArgument(6, static_cast<int>(y_inc));
+ kernel1.SetArgument(7, temp_buffer());
+ kernel1.SetArgument(8, static_cast<int>(do_conjugate));
+
+ // Event waiting list
+ auto eventWaitList = std::vector<Event>();
+
+ // Launches the main kernel
+ auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+ auto local1 = std::vector<size_t>{db_["WGS1"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+ eventWaitList.push_back(kernelEvent);
+
+ // Sets the arguments for the epilogue kernel
+ kernel2.SetArgument(0, temp_buffer());
+ kernel2.SetArgument(1, dot_buffer());
+ kernel2.SetArgument(2, static_cast<int>(dot_offset));
+
+ // Launches the epilogue kernel
+ auto global2 = std::vector<size_t>{db_["WGS2"]};
+ auto local2 = std::vector<size_t>{db_["WGS2"]};
+ RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================
diff --git a/src/routines/level1/xdot.hpp b/src/routines/level1/xdot.hpp
index 02c1efaa..a4c9dfa0 100644
--- a/src/routines/level1/xdot.hpp
+++ b/src/routines/level1/xdot.hpp
@@ -28,11 +28,11 @@ class Xdot: public Routine {
Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
// Templated-precision implementation of the routine
- StatusCode DoDot(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const bool do_conjugate = false);
+ void DoDot(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const bool do_conjugate = false);
};
// =================================================================================================
diff --git a/src/routines/level1/xdotc.cpp b/src/routines/level1/xdotc.cpp
index 27cf2bab..5a4e939a 100644
--- a/src/routines/level1/xdotc.cpp
+++ b/src/routines/level1/xdotc.cpp
@@ -29,14 +29,14 @@ Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xdotc<T>::DoDotc(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
- return DoDot(n, dot_buffer, dot_offset,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- true);
+void Xdotc<T>::DoDotc(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+ DoDot(n, dot_buffer, dot_offset,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ true);
}
// =================================================================================================
diff --git a/src/routines/level1/xdotc.hpp b/src/routines/level1/xdotc.hpp
index b8cbdaf5..ab7465f5 100644
--- a/src/routines/level1/xdotc.hpp
+++ b/src/routines/level1/xdotc.hpp
@@ -31,10 +31,10 @@ class Xdotc: public Xdot<T> {
Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");
// Templated-precision implementation of the routine
- StatusCode DoDotc(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoDotc(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xdotu.cpp b/src/routines/level1/xdotu.cpp
index 0bce70b7..b9d8bcef 100644
--- a/src/routines/level1/xdotu.cpp
+++ b/src/routines/level1/xdotu.cpp
@@ -28,14 +28,14 @@ Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xdotu<T>::DoDotu(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
- return DoDot(n, dot_buffer, dot_offset,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- false);
+void Xdotu<T>::DoDotu(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+ DoDot(n, dot_buffer, dot_offset,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ false);
}
// =================================================================================================
diff --git a/src/routines/level1/xdotu.hpp b/src/routines/level1/xdotu.hpp
index b3f73086..cad91c58 100644
--- a/src/routines/level1/xdotu.hpp
+++ b/src/routines/level1/xdotu.hpp
@@ -31,10 +31,10 @@ class Xdotu: public Xdot<T> {
Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");
// Templated-precision implementation of the routine
- StatusCode DoDotu(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoDotu(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xmax.hpp b/src/routines/level1/xmax.hpp
index 5a0236f2..2b7a5ae7 100644
--- a/src/routines/level1/xmax.hpp
+++ b/src/routines/level1/xmax.hpp
@@ -35,10 +35,10 @@ class Xmax: public Xamax<T> {
// Forwards to the regular absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
- StatusCode DoMax(const size_t n,
- const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
- return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
+ void DoMax(const size_t n,
+ const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+ DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
}
};
diff --git a/src/routines/level1/xmin.hpp b/src/routines/level1/xmin.hpp
index 6befec64..47a195ea 100644
--- a/src/routines/level1/xmin.hpp
+++ b/src/routines/level1/xmin.hpp
@@ -35,10 +35,10 @@ class Xmin: public Xamax<T> {
// Forwards to the regular max-absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
- StatusCode DoMin(const size_t n,
- const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
- return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
+ void DoMin(const size_t n,
+ const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+ DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
}
};
diff --git a/src/routines/level1/xnrm2.cpp b/src/routines/level1/xnrm2.cpp
index 97615d8b..373820a4 100644
--- a/src/routines/level1/xnrm2.cpp
+++ b/src/routines/level1/xnrm2.cpp
@@ -22,71 +22,61 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xnrm2.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xnrm2<T>::DoNrm2(const size_t n,
- const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xnrm2<T>::DoNrm2(const size_t n,
+ const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorScalar(1, nrm2_buffer, nrm2_offset);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorScalar(1, nrm2_buffer, nrm2_offset);
// Retrieves the Xnrm2 kernels from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel1 = Kernel(program, "Xnrm2");
- auto kernel2 = Kernel(program, "Xnrm2Epilogue");
-
- // Creates the buffer for intermediate values
- auto temp_size = 2*db_["WGS2"];
- auto temp_buffer = Buffer<T>(context_, temp_size);
-
- // Sets the kernel arguments
- kernel1.SetArgument(0, static_cast<int>(n));
- kernel1.SetArgument(1, x_buffer());
- kernel1.SetArgument(2, static_cast<int>(x_offset));
- kernel1.SetArgument(3, static_cast<int>(x_inc));
- kernel1.SetArgument(4, temp_buffer());
-
- // Event waiting list
- auto eventWaitList = std::vector<Event>();
-
- // Launches the main kernel
- auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
- auto local1 = std::vector<size_t>{db_["WGS1"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(kernelEvent);
-
- // Sets the arguments for the epilogue kernel
- kernel2.SetArgument(0, temp_buffer());
- kernel2.SetArgument(1, nrm2_buffer());
- kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
-
- // Launches the epilogue kernel
- auto global2 = std::vector<size_t>{db_["WGS2"]};
- auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel1 = Kernel(program, "Xnrm2");
+ auto kernel2 = Kernel(program, "Xnrm2Epilogue");
+
+ // Creates the buffer for intermediate values
+ auto temp_size = 2*db_["WGS2"];
+ auto temp_buffer = Buffer<T>(context_, temp_size);
+
+ // Sets the kernel arguments
+ kernel1.SetArgument(0, static_cast<int>(n));
+ kernel1.SetArgument(1, x_buffer());
+ kernel1.SetArgument(2, static_cast<int>(x_offset));
+ kernel1.SetArgument(3, static_cast<int>(x_inc));
+ kernel1.SetArgument(4, temp_buffer());
+
+ // Event waiting list
+ auto eventWaitList = std::vector<Event>();
+
+ // Launches the main kernel
+ auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+ auto local1 = std::vector<size_t>{db_["WGS1"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+ eventWaitList.push_back(kernelEvent);
+
+ // Sets the arguments for the epilogue kernel
+ kernel2.SetArgument(0, temp_buffer());
+ kernel2.SetArgument(1, nrm2_buffer());
+ kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
+
+ // Launches the epilogue kernel
+ auto global2 = std::vector<size_t>{db_["WGS2"]};
+ auto local2 = std::vector<size_t>{db_["WGS2"]};
+ RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================
diff --git a/src/routines/level1/xnrm2.hpp b/src/routines/level1/xnrm2.hpp
index 7baf07f5..3183ce24 100644
--- a/src/routines/level1/xnrm2.hpp
+++ b/src/routines/level1/xnrm2.hpp
@@ -28,9 +28,9 @@ class Xnrm2: public Routine {
Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
// Templated-precision implementation of the routine
- StatusCode DoNrm2(const size_t n,
- const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoNrm2(const size_t n,
+ const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp
index bcc43c3b..17410f01 100644
--- a/src/routines/level1/xscal.cpp
+++ b/src/routines/level1/xscal.cpp
@@ -22,26 +22,24 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xscal.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xscal<T>::DoScal(const size_t n, const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vector for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@@ -51,41 +49,35 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal";
// Retrieves the Xscal kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- if (use_fast_kernel) {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, alpha);
- kernel.SetArgument(2, x_buffer());
- }
- else {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, alpha);
- kernel.SetArgument(2, x_buffer());
- kernel.SetArgument(3, static_cast<int>(x_offset));
- kernel.SetArgument(4, static_cast<int>(x_inc));
- }
-
- // Launches the kernel
- if (use_fast_kernel) {
- auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- else {
- auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
- auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ if (use_fast_kernel) {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, alpha);
+ kernel.SetArgument(2, x_buffer());
+ }
+ else {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, alpha);
+ kernel.SetArgument(2, x_buffer());
+ kernel.SetArgument(3, static_cast<int>(x_offset));
+ kernel.SetArgument(4, static_cast<int>(x_inc));
+ }
+
+ // Launches the kernel
+ if (use_fast_kernel) {
+ auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
+ else {
+ auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+ auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
}
// =================================================================================================
diff --git a/src/routines/level1/xscal.hpp b/src/routines/level1/xscal.hpp
index 6c585cb2..02c847cc 100644
--- a/src/routines/level1/xscal.hpp
+++ b/src/routines/level1/xscal.hpp
@@ -28,8 +28,8 @@ class Xscal: public Routine {
Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
// Templated-precision implementation of the routine
- StatusCode DoScal(const size_t n, const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoScal(const size_t n, const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level1/xsum.hpp b/src/routines/level1/xsum.hpp
index 84e20bea..a69d6511 100644
--- a/src/routines/level1/xsum.hpp
+++ b/src/routines/level1/xsum.hpp
@@ -35,10 +35,10 @@ class Xsum: public Xasum<T> {
// Forwards to the regular absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
- StatusCode DoSum(const size_t n,
- const Buffer<T> &sum_buffer, const size_t sum_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
- return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
+ void DoSum(const size_t n,
+ const Buffer<T> &sum_buffer, const size_t sum_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+ DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
}
};
diff --git a/src/routines/level1/xswap.cpp b/src/routines/level1/xswap.cpp
index 03907cbd..c9b97dc9 100644
--- a/src/routines/level1/xswap.cpp
+++ b/src/routines/level1/xswap.cpp
@@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xswap.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xswap<T>::DoSwap(const size_t n,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xswap<T>::DoSwap(const size_t n,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
- auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(n, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@@ -55,43 +52,37 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap";
// Retrieves the Xswap kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- if (use_fast_kernel) {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, x_buffer());
- kernel.SetArgument(2, y_buffer());
- }
- else {
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, x_buffer());
- kernel.SetArgument(2, static_cast<int>(x_offset));
- kernel.SetArgument(3, static_cast<int>(x_inc));
- kernel.SetArgument(4, y_buffer());
- kernel.SetArgument(5, static_cast<int>(y_offset));
- kernel.SetArgument(6, static_cast<int>(y_inc));
- }
-
- // Launches the kernel
- if (use_fast_kernel) {
- auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- else {
- auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
- auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
- auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- }
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ if (use_fast_kernel) {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, x_buffer());
+ kernel.SetArgument(2, y_buffer());
+ }
+ else {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, x_buffer());
+ kernel.SetArgument(2, static_cast<int>(x_offset));
+ kernel.SetArgument(3, static_cast<int>(x_inc));
+ kernel.SetArgument(4, y_buffer());
+ kernel.SetArgument(5, static_cast<int>(y_offset));
+ kernel.SetArgument(6, static_cast<int>(y_inc));
+ }
+
+ // Launches the kernel
+ if (use_fast_kernel) {
+ auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
+ else {
+ auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+ auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
}
// =================================================================================================
diff --git a/src/routines/level1/xswap.hpp b/src/routines/level1/xswap.hpp
index 4f9ea36d..eadf58e5 100644
--- a/src/routines/level1/xswap.hpp
+++ b/src/routines/level1/xswap.hpp
@@ -28,9 +28,9 @@ class Xswap: public Routine {
Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
// Templated-precision implementation of the routine
- StatusCode DoSwap(const size_t n,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoSwap(const size_t n,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xgbmv.cpp b/src/routines/level2/xgbmv.cpp
index ea4f001c..e80b9a96 100644
--- a/src/routines/level2/xgbmv.cpp
+++ b/src/routines/level2/xgbmv.cpp
@@ -29,13 +29,13 @@ Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Reverses the upper and lower band count
auto rotated = (layout == Layout::kRowMajor);
@@ -46,13 +46,13 @@ StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
// The specific hermitian matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_GBMV define.
bool fast_kernels = false;
- return MatVec(layout, a_transpose,
- m, n, alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- 0, false, kl_real, ku_real);
+ MatVec(layout, a_transpose,
+ m, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ 0, false, kl_real, ku_real);
}
// =================================================================================================
diff --git a/src/routines/level2/xgbmv.hpp b/src/routines/level2/xgbmv.hpp
index 686ab642..e5f670ec 100644
--- a/src/routines/level2/xgbmv.hpp
+++ b/src/routines/level2/xgbmv.hpp
@@ -33,13 +33,13 @@ class Xgbmv: public Xgemv<T> {
Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");
// Templated-precision implementation of the routine
- StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoGbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp
index 4e32ba41..7b4c2e8f 100644
--- a/src/routines/level2/xgemv.cpp
+++ b/src/routines/level2/xgemv.cpp
@@ -22,52 +22,51 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/xgemv.opencl"
#include "../../kernels/level2/xgemv_fast.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Performs the matrix-vector multiplication
- return MatVec(layout, a_transpose,
- m, n, alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- true, true,
- 0, false, 0, 0); // N/A for this routine
+ MatVec(layout, a_transpose,
+ m, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ true, true,
+ 0, false, 0, 0); // N/A for this routine
}
// =================================================================================================
// The generic implementation, also suited for other (non general) matrix-vector multiplications
template <typename T>
-StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- bool fast_kernel, bool fast_kernel_rot,
- const size_t parameter, const bool packed,
- const size_t kl, const size_t ku) {
+void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ bool fast_kernel, bool fast_kernel_rot,
+ const size_t parameter, const bool packed,
+ const size_t kl, const size_t ku) {
// Makes sure all dimensions are larger than zero
- if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
+ if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrix has an alternative layout (row or column-major).
auto a_altlayout = (layout == Layout::kRowMajor);
@@ -91,14 +90,10 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
auto a_conjugate = (a_transpose == Transpose::kConjugate);
// Tests the matrix and the vectors for validity
- auto status = StatusCode::kSuccess;
- if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
- else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
- if (ErrorIn(status)) { return status; }
- status = TestVectorX(n_real, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(m_real, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
+ if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
+ else { TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
+ TestVectorX(n_real, x_buffer, x_offset, x_inc);
+ TestVectorY(m_real, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
@@ -127,39 +122,33 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
}
// Retrieves the Xgemv kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(m_real));
- kernel.SetArgument(1, static_cast<int>(n_real));
- kernel.SetArgument(2, GetRealArg(alpha));
- kernel.SetArgument(3, GetRealArg(beta));
- kernel.SetArgument(4, static_cast<int>(a_rotated));
- kernel.SetArgument(5, a_buffer());
- kernel.SetArgument(6, static_cast<int>(a_offset));
- kernel.SetArgument(7, static_cast<int>(a_ld));
- kernel.SetArgument(8, x_buffer());
- kernel.SetArgument(9, static_cast<int>(x_offset));
- kernel.SetArgument(10, static_cast<int>(x_inc));
- kernel.SetArgument(11, y_buffer());
- kernel.SetArgument(12, static_cast<int>(y_offset));
- kernel.SetArgument(13, static_cast<int>(y_inc));
- kernel.SetArgument(14, static_cast<int>(a_conjugate));
- kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
- kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
- kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
-
- // Launches the kernel
- auto global = std::vector<size_t>{global_size};
- auto local = std::vector<size_t>{local_size};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(m_real));
+ kernel.SetArgument(1, static_cast<int>(n_real));
+ kernel.SetArgument(2, GetRealArg(alpha));
+ kernel.SetArgument(3, GetRealArg(beta));
+ kernel.SetArgument(4, static_cast<int>(a_rotated));
+ kernel.SetArgument(5, a_buffer());
+ kernel.SetArgument(6, static_cast<int>(a_offset));
+ kernel.SetArgument(7, static_cast<int>(a_ld));
+ kernel.SetArgument(8, x_buffer());
+ kernel.SetArgument(9, static_cast<int>(x_offset));
+ kernel.SetArgument(10, static_cast<int>(x_inc));
+ kernel.SetArgument(11, y_buffer());
+ kernel.SetArgument(12, static_cast<int>(y_offset));
+ kernel.SetArgument(13, static_cast<int>(y_inc));
+ kernel.SetArgument(14, static_cast<int>(a_conjugate));
+ kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
+ kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
+ kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
+
+ // Launches the kernel
+ auto global = std::vector<size_t>{global_size};
+ auto local = std::vector<size_t>{local_size};
+ RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================
diff --git a/src/routines/level2/xgemv.hpp b/src/routines/level2/xgemv.hpp
index e9afec8d..1e1fa726 100644
--- a/src/routines/level2/xgemv.hpp
+++ b/src/routines/level2/xgemv.hpp
@@ -28,25 +28,25 @@ class Xgemv: public Routine {
Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
// Templated-precision implementation of the routine
- StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoGemv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
// Generic version used also for other matrix-vector multiplications
- StatusCode MatVec(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- bool fast_kernel, bool fast_kernel_rot,
- const size_t parameter, const bool packed,
- const size_t kl, const size_t ku);
+ void MatVec(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ bool fast_kernel, bool fast_kernel_rot,
+ const size_t parameter, const bool packed,
+ const size_t kl, const size_t ku);
};
// =================================================================================================
diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp
index 29cffe0c..d16ebd11 100644
--- a/src/routines/level2/xger.cpp
+++ b/src/routines/level2/xger.cpp
@@ -22,26 +22,25 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xger.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xger<T>::DoGer(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xger<T>::DoGer(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Makes sure all dimensions are larger than zero
- if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
+ if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrix has an alternative layout (row or column-major).
const auto a_is_rowmajor = (layout == Layout::kRowMajor);
@@ -49,44 +48,35 @@ StatusCode Xger<T>::DoGer(const Layout layout,
const auto a_two = (a_is_rowmajor) ? m : n;
// Tests the matrix and the vectors for validity
- auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestVectorX(m, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(n, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+ TestVectorX(m, x_buffer, x_offset, x_inc);
+ TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, "Xger");
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(a_one));
- kernel.SetArgument(1, static_cast<int>(a_two));
- kernel.SetArgument(2, GetRealArg(alpha));
- kernel.SetArgument(3, x_buffer());
- kernel.SetArgument(4, static_cast<int>(x_offset));
- kernel.SetArgument(5, static_cast<int>(x_inc));
- kernel.SetArgument(6, y_buffer());
- kernel.SetArgument(7, static_cast<int>(y_offset));
- kernel.SetArgument(8, static_cast<int>(y_inc));
- kernel.SetArgument(9, a_buffer());
- kernel.SetArgument(10, static_cast<int>(a_offset));
- kernel.SetArgument(11, static_cast<int>(a_ld));
- kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));
-
- // Launches the kernel
- auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
- auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
- auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
- auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, "Xger");
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(a_one));
+ kernel.SetArgument(1, static_cast<int>(a_two));
+ kernel.SetArgument(2, GetRealArg(alpha));
+ kernel.SetArgument(3, x_buffer());
+ kernel.SetArgument(4, static_cast<int>(x_offset));
+ kernel.SetArgument(5, static_cast<int>(x_inc));
+ kernel.SetArgument(6, y_buffer());
+ kernel.SetArgument(7, static_cast<int>(y_offset));
+ kernel.SetArgument(8, static_cast<int>(y_inc));
+ kernel.SetArgument(9, a_buffer());
+ kernel.SetArgument(10, static_cast<int>(a_offset));
+ kernel.SetArgument(11, static_cast<int>(a_ld));
+ kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));
+
+ // Launches the kernel
+ auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
+ auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
+ auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
+ auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================
diff --git a/src/routines/level2/xger.hpp b/src/routines/level2/xger.hpp
index 3c6abe44..fbbb07a1 100644
--- a/src/routines/level2/xger.hpp
+++ b/src/routines/level2/xger.hpp
@@ -28,12 +28,12 @@ class Xger: public Routine {
Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
// Templated-precision implementation of the routine
- StatusCode DoGer(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+ void DoGer(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
diff --git a/src/routines/level2/xgerc.cpp b/src/routines/level2/xgerc.cpp
index d9feda97..4fa2e2a8 100644
--- a/src/routines/level2/xgerc.cpp
+++ b/src/routines/level2/xgerc.cpp
@@ -28,19 +28,19 @@ Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xgerc<T>::DoGerc(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xgerc<T>::DoGerc(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Regular Ger operation on complex data, plus conjugation in the kernel guarded by the
// ROUTINE_GERC guard.
- return DoGer(layout, m, n, alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld);
+ DoGer(layout, m, n, alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld);
}
// =================================================================================================
diff --git a/src/routines/level2/xgerc.hpp b/src/routines/level2/xgerc.hpp
index f1d04dfd..2d61f2b7 100644
--- a/src/routines/level2/xgerc.hpp
+++ b/src/routines/level2/xgerc.hpp
@@ -31,12 +31,12 @@ class Xgerc: public Xger<T> {
Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");
// Templated-precision implementation of the routine
- StatusCode DoGerc(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+ void DoGerc(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
diff --git a/src/routines/level2/xgeru.cpp b/src/routines/level2/xgeru.cpp
index da9e91c2..c77e69c5 100644
--- a/src/routines/level2/xgeru.cpp
+++ b/src/routines/level2/xgeru.cpp
@@ -28,18 +28,18 @@ Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xgeru<T>::DoGeru(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xgeru<T>::DoGeru(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Regular Ger operation on complex data
- return DoGer(layout, m, n, alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld);
+ DoGer(layout, m, n, alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld);
}
// =================================================================================================
diff --git a/src/routines/level2/xgeru.hpp b/src/routines/level2/xgeru.hpp
index fb50e917..4cae6b58 100644
--- a/src/routines/level2/xgeru.hpp
+++ b/src/routines/level2/xgeru.hpp
@@ -31,12 +31,12 @@ class Xgeru: public Xger<T> {
Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");
// Templated-precision implementation of the routine
- StatusCode DoGeru(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+ void DoGeru(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
diff --git a/src/routines/level2/xhbmv.cpp b/src/routines/level2/xhbmv.cpp
index f6c0e3c4..c7c9ed9d 100644
--- a/src/routines/level2/xhbmv.cpp
+++ b/src/routines/level2/xhbmv.cpp
@@ -29,13 +29,13 @@ Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -45,13 +45,13 @@ StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
// The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_HBMV define.
bool fast_kernels = false;
- return MatVec(layout, Transpose::kNo,
- n, n, alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- is_upper, false, k, 0);
+ MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, k, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xhbmv.hpp b/src/routines/level2/xhbmv.hpp
index d668eb88..76d3c91e 100644
--- a/src/routines/level2/xhbmv.hpp
+++ b/src/routines/level2/xhbmv.hpp
@@ -33,13 +33,13 @@ class Xhbmv: public Xgemv<T> {
Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");
// Templated-precision implementation of the routine
- StatusCode DoHbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoHbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xhemv.cpp b/src/routines/level2/xhemv.cpp
index 2cbcf7b4..209ff654 100644
--- a/src/routines/level2/xhemv.cpp
+++ b/src/routines/level2/xhemv.cpp
@@ -29,13 +29,13 @@ Xhemv<T>::Xhemv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -45,13 +45,13 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
// The specific hermitian matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_HEMV define.
bool fast_kernels = false;
- return MatVec(layout, Transpose::kNo,
- n, n, alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- is_upper, false, 0, 0);
+ MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, 0, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xhemv.hpp b/src/routines/level2/xhemv.hpp
index 8e062fd3..20d2df22 100644
--- a/src/routines/level2/xhemv.hpp
+++ b/src/routines/level2/xhemv.hpp
@@ -33,13 +33,13 @@ class Xhemv: public Xgemv<T> {
Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");
// Templated-precision implementation of the routine
- StatusCode DoHemv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoHemv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp
index 6dd95938..6c334e63 100644
--- a/src/routines/level2/xher.cpp
+++ b/src/routines/level2/xher.cpp
@@ -21,11 +21,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher.opencl"
- ;
+ }) {
}
// =================================================================================================
@@ -41,15 +40,15 @@ template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; }
// The main routine
template <typename T, typename U>
-StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
- const size_t n,
- const U alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const bool packed) {
+void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const U alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const bool packed) {
// Makes sure the dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// The data is either in the upper or lower triangle
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -57,47 +56,38 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
const auto is_rowmajor = (layout == Layout::kRowMajor);
// Tests the matrix and the vectors for validity
- auto status = StatusCode::kSuccess;
- if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
- else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
- if (ErrorIn(status)) { return status; }
- status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
+ if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
+ else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
// If alpha is zero an update is not required
- if (alpha == U{0}) { return StatusCode::kSuccess; }
+ if (alpha == U{0}) { return; }
// Creates a matching version of alpha
const auto matching_alpha = GetAlpha(alpha);
// Retrieves the kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, "Xher");
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, GetRealArg(matching_alpha));
- kernel.SetArgument(2, x_buffer());
- kernel.SetArgument(3, static_cast<int>(x_offset));
- kernel.SetArgument(4, static_cast<int>(x_inc));
- kernel.SetArgument(5, a_buffer());
- kernel.SetArgument(6, static_cast<int>(a_offset));
- kernel.SetArgument(7, static_cast<int>(a_ld));
- kernel.SetArgument(8, static_cast<int>(is_upper));
- kernel.SetArgument(9, static_cast<int>(is_rowmajor));
-
- // Launches the kernel
- auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
- auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
- auto global = std::vector<size_t>{global_one, global_two};
- auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, "Xher");
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, GetRealArg(matching_alpha));
+ kernel.SetArgument(2, x_buffer());
+ kernel.SetArgument(3, static_cast<int>(x_offset));
+ kernel.SetArgument(4, static_cast<int>(x_inc));
+ kernel.SetArgument(5, a_buffer());
+ kernel.SetArgument(6, static_cast<int>(a_offset));
+ kernel.SetArgument(7, static_cast<int>(a_ld));
+ kernel.SetArgument(8, static_cast<int>(is_upper));
+ kernel.SetArgument(9, static_cast<int>(is_rowmajor));
+
+ // Launches the kernel
+ auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
+ auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
+ auto global = std::vector<size_t>{global_one, global_two};
+ auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================
diff --git a/src/routines/level2/xher.hpp b/src/routines/level2/xher.hpp
index 9ff6bf3f..70a30bda 100644
--- a/src/routines/level2/xher.hpp
+++ b/src/routines/level2/xher.hpp
@@ -31,12 +31,12 @@ class Xher: public Routine {
T GetAlpha(const U alpha);
// Templated-precision implementation of the routine
- StatusCode DoHer(const Layout layout, const Triangle triangle,
- const size_t n,
- const U alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const bool packed = false);
+ void DoHer(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const U alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const bool packed = false);
};
// =================================================================================================
diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp
index 3d57a9b9..11e2c871 100644
--- a/src/routines/level2/xher2.cpp
+++ b/src/routines/level2/xher2.cpp
@@ -21,27 +21,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher2.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const bool packed) {
+void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const bool packed) {
// Makes sure the dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// The data is either in the upper or lower triangle
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -49,46 +48,36 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
const auto is_rowmajor = (layout == Layout::kRowMajor);
// Tests the matrix and the vectors for validity
- auto status = StatusCode::kSuccess;
- if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
- else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
- if (ErrorIn(status)) { return status; }
- status = TestVectorX(n, x_buffer, x_offset, x_inc);
- if (ErrorIn(status)) { return status; }
- status = TestVectorY(n, y_buffer, y_offset, y_inc);
- if (ErrorIn(status)) { return status; }
+ if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
+ else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
+ TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
- try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, "Xher2");
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, GetRealArg(alpha));
- kernel.SetArgument(2, x_buffer());
- kernel.SetArgument(3, static_cast<int>(x_offset));
- kernel.SetArgument(4, static_cast<int>(x_inc));
- kernel.SetArgument(5, y_buffer());
- kernel.SetArgument(6, static_cast<int>(y_offset));
- kernel.SetArgument(7, static_cast<int>(y_inc));
- kernel.SetArgument(8, a_buffer());
- kernel.SetArgument(9, static_cast<int>(a_offset));
- kernel.SetArgument(10, static_cast<int>(a_ld));
- kernel.SetArgument(11, static_cast<int>(is_upper));
- kernel.SetArgument(12, static_cast<int>(is_rowmajor));
-
- // Launches the kernel
- auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
- auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
- auto global = std::vector<size_t>{global_one, global_two};
- auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
- status = RunKernel(kernel, queue_, device_, global, local, event_);
- if (ErrorIn(status)) { return status; }
-
- // Succesfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, "Xher2");
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, GetRealArg(alpha));
+ kernel.SetArgument(2, x_buffer());
+ kernel.SetArgument(3, static_cast<int>(x_offset));
+ kernel.SetArgument(4, static_cast<int>(x_inc));
+ kernel.SetArgument(5, y_buffer());
+ kernel.SetArgument(6, static_cast<int>(y_offset));
+ kernel.SetArgument(7, static_cast<int>(y_inc));
+ kernel.SetArgument(8, a_buffer());
+ kernel.SetArgument(9, static_cast<int>(a_offset));
+ kernel.SetArgument(10, static_cast<int>(a_ld));
+ kernel.SetArgument(11, static_cast<int>(is_upper));
+ kernel.SetArgument(12, static_cast<int>(is_rowmajor));
+
+ // Launches the kernel
+ auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
+ auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
+ auto global = std::vector<size_t>{global_one, global_two};
+ auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================
diff --git a/src/routines/level2/xher2.hpp b/src/routines/level2/xher2.hpp
index 8c53c047..dcb2ecb7 100644
--- a/src/routines/level2/xher2.hpp
+++ b/src/routines/level2/xher2.hpp
@@ -28,13 +28,13 @@ class Xher2: public Routine {
Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
// Templated-precision implementation of the routine
- StatusCode DoHer2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const bool packed = false);
+ void DoHer2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const bool packed = false);
};
// =================================================================================================
diff --git a/src/routines/level2/xhpmv.cpp b/src/routines/level2/xhpmv.cpp
index e6f82b34..70a0ab0d 100644
--- a/src/routines/level2/xhpmv.cpp
+++ b/src/routines/level2/xhpmv.cpp
@@ -29,13 +29,13 @@ Xhpmv<T>::Xhpmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -45,13 +45,13 @@ StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
// The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_HPMV define.
bool fast_kernels = false;
- return MatVec(layout, Transpose::kNo,
- n, n, alpha,
- ap_buffer, ap_offset, n,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- is_upper, true, 0, 0);
+ MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ ap_buffer, ap_offset, n,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, true, 0, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xhpmv.hpp b/src/routines/level2/xhpmv.hpp
index b11192f9..13a6277c 100644
--- a/src/routines/level2/xhpmv.hpp
+++ b/src/routines/level2/xhpmv.hpp
@@ -33,13 +33,13 @@ class Xhpmv: public Xgemv<T> {
Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");
// Templated-precision implementation of the routine
- StatusCode DoHpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoHpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xhpr.cpp b/src/routines/level2/xhpr.cpp
index 225ebfe5..7e517c59 100644
--- a/src/routines/level2/xhpr.cpp
+++ b/src/routines/level2/xhpr.cpp
@@ -28,17 +28,17 @@ Xhpr<T,U>::Xhpr(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T, typename U>
-StatusCode Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const U alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const U alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xhpr functionality is implemented in the kernel using defines
- return DoHer(layout, triangle, n, alpha,
- x_buffer, x_offset, x_inc,
- ap_buffer, ap_offset, n,
- true); // packed matrix
+ DoHer(layout, triangle, n, alpha,
+ x_buffer, x_offset, x_inc,
+ ap_buffer, ap_offset, n,
+ true); // packed matrix
}
// =================================================================================================
diff --git a/src/routines/level2/xhpr.hpp b/src/routines/level2/xhpr.hpp
index 37801c68..6ebc220e 100644
--- a/src/routines/level2/xhpr.hpp
+++ b/src/routines/level2/xhpr.hpp
@@ -31,11 +31,11 @@ class Xhpr: public Xher<T,U> {
Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");
// Templated-precision implementation of the routine
- StatusCode DoHpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const U alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset);
+ void DoHpr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const U alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================
diff --git a/src/routines/level2/xhpr2.cpp b/src/routines/level2/xhpr2.cpp
index 85f9d3f9..35daa365 100644
--- a/src/routines/level2/xhpr2.cpp
+++ b/src/routines/level2/xhpr2.cpp
@@ -28,19 +28,19 @@ Xhpr2<T>::Xhpr2(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xhpr2 functionality is implemented in the kernel using defines
- return DoHer2(layout, triangle, n, alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- ap_buffer, ap_offset, n,
- true); // packed matrix
+ DoHer2(layout, triangle, n, alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ ap_buffer, ap_offset, n,
+ true); // packed matrix
}
// =================================================================================================
diff --git a/src/routines/level2/xhpr2.hpp b/src/routines/level2/xhpr2.hpp
index d66dce55..f344fd48 100644
--- a/src/routines/level2/xhpr2.hpp
+++ b/src/routines/level2/xhpr2.hpp
@@ -31,12 +31,12 @@ class Xhpr2: public Xher2<T> {
Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");
// Templated-precision implementation of the routine
- StatusCode DoHpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset);
+ void DoHpr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================
diff --git a/src/routines/level2/xsbmv.cpp b/src/routines/level2/xsbmv.cpp
index 28730899..e47430d1 100644
--- a/src/routines/level2/xsbmv.cpp
+++ b/src/routines/level2/xsbmv.cpp
@@ -29,13 +29,13 @@ Xsbmv<T>::Xsbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -45,13 +45,13 @@ StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
// The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_SBMV define.
bool fast_kernels = false;
- return MatVec(layout, Transpose::kNo,
- n, n, alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- is_upper, false, k, 0);
+ MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, k, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xsbmv.hpp b/src/routines/level2/xsbmv.hpp
index 16c5e9a8..a4542f49 100644
--- a/src/routines/level2/xsbmv.hpp
+++ b/src/routines/level2/xsbmv.hpp
@@ -33,13 +33,13 @@ class Xsbmv: public Xgemv<T> {
Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");
// Templated-precision implementation of the routine
- StatusCode DoSbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoSbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xspmv.cpp b/src/routines/level2/xspmv.cpp
index f6651012..bf1a49e1 100644
--- a/src/routines/level2/xspmv.cpp
+++ b/src/routines/level2/xspmv.cpp
@@ -29,13 +29,13 @@ Xspmv<T>::Xspmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -45,13 +45,13 @@ StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
// The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_SPMV define.
bool fast_kernels = false;
- return MatVec(layout, Transpose::kNo,
- n, n, alpha,
- ap_buffer, ap_offset, n,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- is_upper, true, 0, 0);
+ MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ ap_buffer, ap_offset, n,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, true, 0, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xspmv.hpp b/src/routines/level2/xspmv.hpp
index a0c69b85..94caa4ac 100644
--- a/src/routines/level2/xspmv.hpp
+++ b/src/routines/level2/xspmv.hpp
@@ -33,13 +33,13 @@ class Xspmv: public Xgemv<T> {
Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");
// Templated-precision implementation of the routine
- StatusCode DoSpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoSpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xspr.cpp b/src/routines/level2/xspr.cpp
index a75fe9c3..56791a7b 100644
--- a/src/routines/level2/xspr.cpp
+++ b/src/routines/level2/xspr.cpp
@@ -28,17 +28,17 @@ Xspr<T>::Xspr(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xspr functionality is implemented in the kernel using defines
- return DoHer(layout, triangle, n, alpha,
- x_buffer, x_offset, x_inc,
- ap_buffer, ap_offset, n,
- true); // packed matrix
+ DoHer(layout, triangle, n, alpha,
+ x_buffer, x_offset, x_inc,
+ ap_buffer, ap_offset, n,
+ true); // packed matrix
}
// =================================================================================================
diff --git a/src/routines/level2/xspr.hpp b/src/routines/level2/xspr.hpp
index 6468c736..760a2ddb 100644
--- a/src/routines/level2/xspr.hpp
+++ b/src/routines/level2/xspr.hpp
@@ -31,11 +31,11 @@ class Xspr: public Xher<T,T> {
Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");
// Templated-precision implementation of the routine
- StatusCode DoSpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset);
+ void DoSpr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================
diff --git a/src/routines/level2/xspr2.cpp b/src/routines/level2/xspr2.cpp
index c39a2eb4..8d0432c2 100644
--- a/src/routines/level2/xspr2.cpp
+++ b/src/routines/level2/xspr2.cpp
@@ -28,19 +28,19 @@ Xspr2<T>::Xspr2(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xspr2 functionality is implemented in the kernel using defines
- return DoHer2(layout, triangle, n, alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- ap_buffer, ap_offset, n,
- true); // packed matrix
+ DoHer2(layout, triangle, n, alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ ap_buffer, ap_offset, n,
+ true); // packed matrix
}
// =================================================================================================
diff --git a/src/routines/level2/xspr2.hpp b/src/routines/level2/xspr2.hpp
index 693c56a1..9f03f768 100644
--- a/src/routines/level2/xspr2.hpp
+++ b/src/routines/level2/xspr2.hpp
@@ -31,12 +31,12 @@ class Xspr2: public Xher2<T> {
Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");
// Templated-precision implementation of the routine
- StatusCode DoSpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset);
+ void DoSpr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================
diff --git a/src/routines/level2/xsymv.cpp b/src/routines/level2/xsymv.cpp
index 648d2a3e..86bb66b8 100644
--- a/src/routines/level2/xsymv.cpp
+++ b/src/routines/level2/xsymv.cpp
@@ -29,13 +29,13 @@ Xsymv<T>::Xsymv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -45,13 +45,13 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
// The specific symmetric matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_SYMV define.
bool fast_kernels = false;
- return MatVec(layout, Transpose::kNo,
- n, n, alpha,
- a_buffer, a_offset, a_ld,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc,
- fast_kernels, fast_kernels,
- is_upper, false, 0, 0);
+ MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, 0, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xsymv.hpp b/src/routines/level2/xsymv.hpp
index 67815f2f..3945802f 100644
--- a/src/routines/level2/xsymv.hpp
+++ b/src/routines/level2/xsymv.hpp
@@ -33,13 +33,13 @@ class Xsymv: public Xgemv<T> {
Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV");
// Templated-precision implementation of the routine
- StatusCode DoSymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ void DoSymv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xsyr.cpp b/src/routines/level2/xsyr.cpp
index 758d8f8f..64c2dc74 100644
--- a/src/routines/level2/xsyr.cpp
+++ b/src/routines/level2/xsyr.cpp
@@ -28,16 +28,16 @@ Xsyr<T>::Xsyr(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Specific Xsyr functionality is implemented in the kernel using defines
- return DoHer(layout, triangle, n, alpha,
- x_buffer, x_offset, x_inc,
- a_buffer, a_offset, a_ld);
+ DoHer(layout, triangle, n, alpha,
+ x_buffer, x_offset, x_inc,
+ a_buffer, a_offset, a_ld);
}
// =================================================================================================
diff --git a/src/routines/level2/xsyr.hpp b/src/routines/level2/xsyr.hpp
index 20393454..a23ff80f 100644
--- a/src/routines/level2/xsyr.hpp
+++ b/src/routines/level2/xsyr.hpp
@@ -31,11 +31,11 @@ class Xsyr: public Xher<T,T> {
Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR");
// Templated-precision implementation of the routine
- StatusCode DoSyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+ void DoSyr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
diff --git a/src/routines/level2/xsyr2.cpp b/src/routines/level2/xsyr2.cpp
index 6f43b219..38ca9d69 100644
--- a/src/routines/level2/xsyr2.cpp
+++ b/src/routines/level2/xsyr2.cpp
@@ -28,18 +28,18 @@ Xsyr2<T>::Xsyr2(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Specific Xsyr2 functionality is implemented in the kernel using defines
- return DoHer2(layout, triangle, n, alpha,
- x_buffer, x_offset, x_inc,
- y_buffer, y_offset, y_inc,
- a_buffer, a_offset, a_ld);
+ DoHer2(layout, triangle, n, alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld);
}
// =================================================================================================
diff --git a/src/routines/level2/xsyr2.hpp b/src/routines/level2/xsyr2.hpp
index 1a8dcbe8..5a8d8eb4 100644
--- a/src/routines/level2/xsyr2.hpp
+++ b/src/routines/level2/xsyr2.hpp
@@ -31,12 +31,12 @@ class Xsyr2: public Xher2<T> {
Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2");
// Templated-precision implementation of the routine
- StatusCode DoSyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+ void DoSyr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
diff --git a/src/routines/level2/xtbmv.cpp b/src/routines/level2/xtbmv.cpp
index e315c544..f4a58ed2 100644
--- a/src/routines/level2/xtbmv.cpp
+++ b/src/routines/level2/xtbmv.cpp
@@ -29,17 +29,15 @@ Xtbmv<T>::Xtbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n, const size_t k,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Creates a copy of X: a temporary scratch buffer
auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
- try {
- x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
- } catch (...) { } // Continues: error-code is returned in MatVec
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -52,20 +50,22 @@ StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
// The specific triangular banded matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_TBMV define.
auto fast_kernels = false;
- auto status = MatVec(layout, a_transpose,
- n, n, static_cast<T>(1),
- a_buffer, a_offset, a_ld,
- scratch_buffer, x_offset, x_inc, static_cast<T>(0),
- x_buffer, x_offset, x_inc,
- fast_kernels, fast_kernels,
- parameter, false, k, 0);
-
- // Returns the proper error code (renames vector Y to X)
- switch(status) {
- case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
- case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
- case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
- default: return status;
+ try {
+ MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ a_buffer, a_offset, a_ld,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, false, k, 0);
+ } catch (BLASError &e) {
+ // Returns the proper error code (renames vector Y to X)
+ switch (e.status()) {
+ case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details());
+ case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details());
+ case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details());
+ default: throw;
+ }
}
}
diff --git a/src/routines/level2/xtbmv.hpp b/src/routines/level2/xtbmv.hpp
index 389e9705..abd12db6 100644
--- a/src/routines/level2/xtbmv.hpp
+++ b/src/routines/level2/xtbmv.hpp
@@ -35,11 +35,11 @@ class Xtbmv: public Xgemv<T> {
Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV");
// Templated-precision implementation of the routine
- StatusCode DoTbmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoTbmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n, const size_t k,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xtpmv.cpp b/src/routines/level2/xtpmv.cpp
index 46811089..c0d26699 100644
--- a/src/routines/level2/xtpmv.cpp
+++ b/src/routines/level2/xtpmv.cpp
@@ -29,17 +29,15 @@ Xtpmv<T>::Xtpmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Creates a copy of X: a temporary scratch buffer
auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
- try {
- x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
- } catch (...) { } // Continues: error-code is returned in MatVec
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -52,20 +50,22 @@ StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
// The specific triangular packed matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_TPMV define.
auto fast_kernels = false;
- auto status = MatVec(layout, a_transpose,
- n, n, static_cast<T>(1),
- ap_buffer, ap_offset, n,
- scratch_buffer, x_offset, x_inc, static_cast<T>(0),
- x_buffer, x_offset, x_inc,
- fast_kernels, fast_kernels,
- parameter, true, 0, 0);
-
- // Returns the proper error code (renames vector Y to X)
- switch(status) {
- case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
- case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
- case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
- default: return status;
+ try {
+ MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ ap_buffer, ap_offset, n,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, true, 0, 0);
+ } catch (BLASError &e) {
+ // Returns the proper error code (renames vector Y to X)
+ switch (e.status()) {
+ case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details());
+ case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details());
+ case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details());
+ default: throw;
+ }
}
}
diff --git a/src/routines/level2/xtpmv.hpp b/src/routines/level2/xtpmv.hpp
index 0e8cf1d2..5b3954e8 100644
--- a/src/routines/level2/xtpmv.hpp
+++ b/src/routines/level2/xtpmv.hpp
@@ -35,11 +35,11 @@ class Xtpmv: public Xgemv<T> {
Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV");
// Templated-precision implementation of the routine
- StatusCode DoTpmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoTpmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level2/xtrmv.cpp b/src/routines/level2/xtrmv.cpp
index d2f24252..5fff9b31 100644
--- a/src/routines/level2/xtrmv.cpp
+++ b/src/routines/level2/xtrmv.cpp
@@ -29,17 +29,15 @@ Xtrmv<T>::Xtrmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Creates a copy of X: a temporary scratch buffer
auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
- try {
- x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
- } catch (...) { } // Continues: error-code is returned in MatVec
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@@ -52,20 +50,22 @@ StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
// The specific triangular matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_TRMV define.
auto fast_kernels = false;
- auto status = MatVec(layout, a_transpose,
- n, n, static_cast<T>(1),
- a_buffer, a_offset, a_ld,
- scratch_buffer, x_offset, x_inc, static_cast<T>(0),
- x_buffer, x_offset, x_inc,
- fast_kernels, fast_kernels,
- parameter, false, 0, 0);
-
- // Returns the proper error code (renames vector Y to X)
- switch(status) {
- case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
- case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
- case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
- default: return status;
+ try {
+ MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ a_buffer, a_offset, a_ld,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, false, 0, 0);
+ } catch (BLASError &e) {
+ // Returns the proper error code (renames vector Y to X)
+ switch (e.status()) {
+ case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details());
+ case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details());
+ case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details());
+ default: throw;
+ }
}
}
diff --git a/src/routines/level2/xtrmv.hpp b/src/routines/level2/xtrmv.hpp
index 07dd7841..b028ee68 100644
--- a/src/routines/level2/xtrmv.hpp
+++ b/src/routines/level2/xtrmv.hpp
@@ -35,11 +35,11 @@ class Xtrmv: public Xgemv<T> {
Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV");
// Templated-precision implementation of the routine
- StatusCode DoTrmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+ void DoTrmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 1602c69f..4f70dc7a 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -24,8 +24,7 @@ template <typename T>
Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name,
{"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
- PrecisionValue<T>()) {
- source_string_ =
+ PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@@ -37,30 +36,28 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_direct_part1.opencl"
#include "../../kernels/level3/xgemm_direct_part2.opencl"
#include "../../kernels/level3/xgemm_direct_part3.opencl"
- ;
- auto source_string_part_2 = // separated in two parts to prevent C1091 in MSVC 2013
+ , // separated in two parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
- ;
- source_string_ += source_string_part_2;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xgemm<T>::DoGemm(const Layout layout,
- const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
+void Xgemm<T>::DoGemm(const Layout layout,
+ const Transpose a_transpose, const Transpose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; }
+ if ((m == 0) || (n == 0) || (k == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed. Note
@@ -99,12 +96,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// matrix A cannot be less than K when rotated, or less than M when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N when rotated, or less than M when not-rotated
- auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+ TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
+ TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
// Selects which version of GEMM to run
const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]);
@@ -131,7 +125,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// requirements, but several pre and post-processing kernels take care of those. However, the
// overhead of these extra kernels might not be ideal for certain devices/arguments.
template <typename T>
-StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
+void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
@@ -142,8 +136,6 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k
const size_t a_one, const size_t a_two, const bool a_want_rotated,
const size_t b_one, const size_t b_two, const bool b_want_rotated,
const size_t c_one, const size_t c_two, const bool c_want_rotated) {
- auto status = StatusCode::kSuccess;
-
// Calculates the ceiled versions of m, n, and k
const auto m_ceiled = Ceil(m, db_["MWG"]);
const auto n_ceiled = Ceil(n, db_["NWG"]);
@@ -158,109 +150,95 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k
const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled;
const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled;
- // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
- try {
-
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
- // Determines whether or not temporary matrices are needed
- auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 &&
- a_do_transpose == false && a_conjugate == false;
- auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 &&
- b_do_transpose == false && b_conjugate == false;
- auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 &&
- c_do_transpose == false;
-
- // Creates the temporary matrices
- const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i);
- const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i);
- const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i);
-
- // Events of all kernels (including pre/post processing kernels)
- auto eventWaitList = std::vector<Event>();
- auto emptyEventList = std::vector<Event>();
-
- // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
- // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
- // case nothing has to be done, these kernels can be skipped.
- if (!a_no_temp) {
- auto eventProcessA = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
- a_one, a_two, a_ld, a_offset, a_buffer,
- a_one_i, a_two_i, a_one_i, 0, a_temp,
- ConstantOne<T>(), program,
- true, a_do_transpose, a_conjugate);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessA);
- }
-
- // As above, but now for matrix B
- if (!b_no_temp) {
- auto eventProcessB = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
- b_one, b_two, b_ld, b_offset, b_buffer,
- b_one_i, b_two_i, b_one_i, 0, b_temp,
- ConstantOne<T>(), program,
- true, b_do_transpose, b_conjugate);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessB);
- }
-
- // As above, but now for matrix C. This is only necessary if C is used both as input and output.
- if (!c_no_temp && beta != static_cast<T>(0)) {
- auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
- c_one, c_two, c_ld, c_offset, c_buffer,
- c_one_i, c_two_i, c_one_i, 0, c_temp,
- ConstantOne<T>(), program,
- true, c_do_transpose, false);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessC);
- }
-
- // Retrieves the Xgemm kernel from the compiled binary
- try {
- auto kernel = Kernel(program, "Xgemm");
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(m_ceiled));
- kernel.SetArgument(1, static_cast<int>(n_ceiled));
- kernel.SetArgument(2, static_cast<int>(k_ceiled));
- kernel.SetArgument(3, GetRealArg(alpha));
- kernel.SetArgument(4, GetRealArg(beta));
- kernel.SetArgument(5, a_temp());
- kernel.SetArgument(6, b_temp());
- kernel.SetArgument(7, c_temp());
-
- // Computes the global and local thread sizes
- const auto global = std::vector<size_t>{
- (c_one_i * db_["MDIMC"]) / db_["MWG"],
- (c_two_i * db_["NDIMC"]) / db_["NWG"]
- };
- const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
- // Launches the kernel
- auto eventKernel = Event();
- auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
- status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList);
- if (ErrorIn(status)) { return status; }
-
- // Runs the post-processing kernel if needed
- if (!c_no_temp) {
- eventWaitList.push_back(eventKernel);
- status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
- c_one_i, c_two_i, c_one_i, 0, c_temp,
- c_one, c_two, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
- false, c_do_transpose, false);
- if (ErrorIn(status)) { return status; }
- }
-
- // Successfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // Loads the program from the database
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+ // Determines whether or not temporary matrices are needed
+ auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 &&
+ a_do_transpose == false && a_conjugate == false;
+ auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 &&
+ b_do_transpose == false && b_conjugate == false;
+ auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 &&
+ c_do_transpose == false;
+
+ // Creates the temporary matrices
+ const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i);
+ const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i);
+ const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i);
+
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
+ // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
+ // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+ // case nothing has to be done, these kernels can be skipped.
+ if (!a_no_temp) {
+ auto eventProcessA = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
+ a_one_i, a_two_i, a_one_i, 0, a_temp,
+ ConstantOne<T>(), program,
+ true, a_do_transpose, a_conjugate);
+ eventWaitList.push_back(eventProcessA);
+ }
+
+ // As above, but now for matrix B
+ if (!b_no_temp) {
+ auto eventProcessB = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
+ b_one, b_two, b_ld, b_offset, b_buffer,
+ b_one_i, b_two_i, b_one_i, 0, b_temp,
+ ConstantOne<T>(), program,
+ true, b_do_transpose, b_conjugate);
+ eventWaitList.push_back(eventProcessB);
+ }
+
+ // As above, but now for matrix C. This is only necessary if C is used both as input and output.
+ if (!c_no_temp && beta != static_cast<T>(0)) {
+ auto eventProcessC = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+ c_one, c_two, c_ld, c_offset, c_buffer,
+ c_one_i, c_two_i, c_one_i, 0, c_temp,
+ ConstantOne<T>(), program,
+ true, c_do_transpose, false);
+ eventWaitList.push_back(eventProcessC);
+ }
+
+ // Retrieves the Xgemm kernel from the compiled binary
+ auto kernel = Kernel(program, "Xgemm");
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(m_ceiled));
+ kernel.SetArgument(1, static_cast<int>(n_ceiled));
+ kernel.SetArgument(2, static_cast<int>(k_ceiled));
+ kernel.SetArgument(3, GetRealArg(alpha));
+ kernel.SetArgument(4, GetRealArg(beta));
+ kernel.SetArgument(5, a_temp());
+ kernel.SetArgument(6, b_temp());
+ kernel.SetArgument(7, c_temp());
+
+ // Computes the global and local thread sizes
+ const auto global = std::vector<size_t>{
+ (c_one_i * db_["MDIMC"]) / db_["MWG"],
+ (c_two_i * db_["NDIMC"]) / db_["NWG"]
+ };
+ const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+ // Launches the kernel
+ auto eventKernel = Event();
+ auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
+ RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList);
+
+ // Runs the post-processing kernel if needed
+ if (!c_no_temp) {
+ eventWaitList.push_back(eventKernel);
+ PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+ c_one_i, c_two_i, c_one_i, 0, c_temp,
+ c_one, c_two, c_ld, c_offset, c_buffer,
+ ConstantOne<T>(), program,
+ false, c_do_transpose, false);
+ }
}
@@ -268,7 +246,7 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k
// The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels.
template <typename T>
-StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
+void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
@@ -281,46 +259,40 @@ StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Retrieves the proper XgemmDirect kernel from the compiled binary
- try {
- const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
- (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
- auto kernel = Kernel(program, name);
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(m));
- kernel.SetArgument(1, static_cast<int>(n));
- kernel.SetArgument(2, static_cast<int>(k));
- kernel.SetArgument(3, GetRealArg(alpha));
- kernel.SetArgument(4, GetRealArg(beta));
- kernel.SetArgument(5, a_buffer());
- kernel.SetArgument(6, static_cast<int>(a_offset));
- kernel.SetArgument(7, static_cast<int>(a_ld));
- kernel.SetArgument(8, b_buffer());
- kernel.SetArgument(9, static_cast<int>(b_offset));
- kernel.SetArgument(10, static_cast<int>(b_ld));
- kernel.SetArgument(11, c_buffer());
- kernel.SetArgument(12, static_cast<int>(c_offset));
- kernel.SetArgument(13, static_cast<int>(c_ld));
- kernel.SetArgument(14, static_cast<int>(c_do_transpose));
- kernel.SetArgument(15, static_cast<int>(a_conjugate));
- kernel.SetArgument(16, static_cast<int>(b_conjugate));
-
- // Computes the global and local thread sizes
- const auto m_ceiled = Ceil(m, db_["WGD"]);
- const auto n_ceiled = Ceil(n, db_["WGD"]);
- const auto global = std::vector<size_t>{
- (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
- (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
- };
- const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
-
- // Launches the kernel
- auto status = RunKernel(kernel, queue_, device_, global, local, event_);
- if (ErrorIn(status)) { return status; }
-
- // Successfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
+ const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
+ (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
+ auto kernel = Kernel(program, name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(m));
+ kernel.SetArgument(1, static_cast<int>(n));
+ kernel.SetArgument(2, static_cast<int>(k));
+ kernel.SetArgument(3, GetRealArg(alpha));
+ kernel.SetArgument(4, GetRealArg(beta));
+ kernel.SetArgument(5, a_buffer());
+ kernel.SetArgument(6, static_cast<int>(a_offset));
+ kernel.SetArgument(7, static_cast<int>(a_ld));
+ kernel.SetArgument(8, b_buffer());
+ kernel.SetArgument(9, static_cast<int>(b_offset));
+ kernel.SetArgument(10, static_cast<int>(b_ld));
+ kernel.SetArgument(11, c_buffer());
+ kernel.SetArgument(12, static_cast<int>(c_offset));
+ kernel.SetArgument(13, static_cast<int>(c_ld));
+ kernel.SetArgument(14, static_cast<int>(c_do_transpose));
+ kernel.SetArgument(15, static_cast<int>(a_conjugate));
+ kernel.SetArgument(16, static_cast<int>(b_conjugate));
+
+ // Computes the global and local thread sizes
+ const auto m_ceiled = Ceil(m, db_["WGD"]);
+ const auto n_ceiled = Ceil(n, db_["WGD"]);
+ const auto global = std::vector<size_t>{
+ (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
+ (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
+ };
+ const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
+
+ // Launches the kernel
+ RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================
diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp
index 46e12453..c61611b6 100644
--- a/src/routines/level3/xgemm.hpp
+++ b/src/routines/level3/xgemm.hpp
@@ -28,36 +28,36 @@ class Xgemm: public Routine {
Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM");
// Templated-precision implementation of the routine
- StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
+ void DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ // Indirect version of GEMM (with pre and post-processing kernels)
+ void GemmIndirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-
- // Indirect version of GEMM (with pre and post-processing kernels)
- StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
- const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
- const bool a_conjugate, const bool b_conjugate,
- const size_t a_one, const size_t a_two, const bool a_want_rotated,
- const size_t b_one, const size_t b_two, const bool b_want_rotated,
- const size_t c_one, const size_t c_two, const bool c_want_rotated);
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate,
+ const size_t a_one, const size_t a_two, const bool a_want_rotated,
+ const size_t b_one, const size_t b_two, const bool b_want_rotated,
+ const size_t c_one, const size_t c_two, const bool c_want_rotated);
// Direct version of GEMM (no pre and post-processing kernels)
- StatusCode GemmDirect(const size_t m, const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
- const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
- const bool a_conjugate, const bool b_conjugate);
+ void GemmDirect(const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate);
};
// =================================================================================================
diff --git a/src/routines/level3/xhemm.cpp b/src/routines/level3/xhemm.cpp
index 9813503e..e5b1502a 100644
--- a/src/routines/level3/xhemm.cpp
+++ b/src/routines/level3/xhemm.cpp
@@ -29,7 +29,7 @@ Xhemm<T>::Xhemm(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
+void Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@@ -38,15 +38,14 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
+ if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the
// left) or B (on the right) in the Xgemm routine.
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the squared A matrix
- auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(k, k, a_buffer, a_offset, a_ld);
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
@@ -55,73 +54,68 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
// Temporary buffer for a copy of the hermitian matrix
- try {
- auto temp_herm = Buffer<T>(context_, k*k);
-
- // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
- // routine afterwards
+ auto temp_herm = Buffer<T>(context_, k*k);
+
+ // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
+ // routine afterwards
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the arguments for the hermitian-to-squared kernel
+ kernel.SetArgument(0, static_cast<int>(k));
+ kernel.SetArgument(1, static_cast<int>(a_ld));
+ kernel.SetArgument(2, static_cast<int>(a_offset));
+ kernel.SetArgument(3, a_buffer());
+ kernel.SetArgument(4, static_cast<int>(k));
+ kernel.SetArgument(5, static_cast<int>(k));
+ kernel.SetArgument(6, static_cast<int>(0));
+ kernel.SetArgument(7, temp_herm());
+
+ // Uses the common padding kernel's thread configuration. This is allowed, since the
+ // hermitian-to-squared kernel uses the same parameters.
+ auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+ Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+ auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
+
+ // Synchronize now: 'DoGemm' does not accept a list of events to wait for
+ kernelEvent.WaitForCompletion();
+
+ // Runs the regular Xgemm code with either "C := AB+C" or ...
+ if (side == Side::kLeft) {
+ DoGemm(layout, Transpose::kNo, Transpose::kNo,
+ m, n, k,
+ alpha,
+ temp_herm, 0, k,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld);
+ }
+
+ // ... with "C := BA+C". Note that A and B are now reversed.
+ else {
try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the arguments for the hermitian-to-squared kernel
- kernel.SetArgument(0, static_cast<int>(k));
- kernel.SetArgument(1, static_cast<int>(a_ld));
- kernel.SetArgument(2, static_cast<int>(a_offset));
- kernel.SetArgument(3, a_buffer());
- kernel.SetArgument(4, static_cast<int>(k));
- kernel.SetArgument(5, static_cast<int>(k));
- kernel.SetArgument(6, static_cast<int>(0));
- kernel.SetArgument(7, temp_herm());
-
- // Uses the common padding kernel's thread configuration. This is allowed, since the
- // hermitian-to-squared kernel uses the same parameters.
- auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
- Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
- auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
-
- // Synchronize now: 'DoGemm' does not accept a list of events to wait for
- kernelEvent.WaitForCompletion();
-
- // Runs the regular Xgemm code with either "C := AB+C" or ...
- if (side == Side::kLeft) {
- status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
- m, n, k,
- alpha,
- temp_herm, 0, k,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld);
- }
-
- // ... with "C := BA+C". Note that A and B are now reversed.
- else {
- status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
- m, n, k,
- alpha,
- b_buffer, b_offset, b_ld,
- temp_herm, 0, k,
- beta,
- c_buffer, c_offset, c_ld);
-
- // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
- switch(status) {
- case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
- case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
- case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
- case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
- case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
- case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
- }
+ DoGemm(layout, Transpose::kNo, Transpose::kNo,
+ m, n, k,
+ alpha,
+ b_buffer, b_offset, b_ld,
+ temp_herm, 0, k,
+ beta,
+ c_buffer, c_offset, c_ld);
+ } catch (BLASError &e) {
+ // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+ switch(e.status()) {
+ case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details());
+ case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details());
+ case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details());
+ case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details());
+ case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details());
+ case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details());
+ default: throw;
}
-
- // Return the status of the Xgemm routine
- return status;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ }
+ }
}
// =================================================================================================
diff --git a/src/routines/level3/xhemm.hpp b/src/routines/level3/xhemm.hpp
index 272bd2ec..2385706e 100644
--- a/src/routines/level3/xhemm.hpp
+++ b/src/routines/level3/xhemm.hpp
@@ -37,13 +37,13 @@ class Xhemm: public Xgemm<T> {
Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM");
// Templated-precision implementation of the routine
- StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+ void DoHemm(const Layout layout, const Side side, const Triangle triangle,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp
index bf328729..ee3bb8b8 100644
--- a/src/routines/level3/xher2k.cpp
+++ b/src/routines/level3/xher2k.cpp
@@ -22,8 +22,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@@ -32,23 +31,23 @@ Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T, typename U>
-StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const U beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
+void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const U beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+ if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
// to matrix A (argument: conjugate transpose)
@@ -71,12 +70,9 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
- auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
+ TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
+ TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
@@ -85,145 +81,128 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
- // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
- try {
-
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
- // Determines whether or not temporary matrices are needed
- auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
- ab_rotated == false && ab_conjugate == false;
- auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
- ab_rotated == false && ab_conjugate == true;
- auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
- ab_rotated == false && ab_conjugate == false;
- auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
- ab_rotated == false && ab_conjugate == true;
-
- // Creates the temporary matrices
- auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
-
- // Convert the arguments to complex versions
- auto complex_beta = T{beta, static_cast<U>(0.0)};
-
- // Events of all kernels (including pre/post processing kernels)
- auto eventWaitList = std::vector<Event>();
- auto emptyEventList = std::vector<Event>();
-
- // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
- // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
- // case nothing has to be done, these kernels can be skipped.
- if (!a1_no_temp) {
- auto eventProcessA1 = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList,
- ab_one, ab_two, a_ld, a_offset, a_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
- ConstantOne<T>(), program,
- true, ab_rotated, ab_conjugate);
- eventWaitList.push_back(eventProcessA1);
- if (ErrorIn(status)) { return status; }
- }
- if (!a2_no_temp) {
- auto eventProcessA2 = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList,
- ab_one, ab_two, a_ld, a_offset, a_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
- ConstantOne<T>(), program,
- true, ab_rotated, !ab_conjugate);
- eventWaitList.push_back(eventProcessA2);
- if (ErrorIn(status)) { return status; }
- }
- if (!b1_no_temp) {
- auto eventProcessB1 = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList,
- ab_one, ab_two, b_ld, b_offset, b_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
- ConstantOne<T>(), program,
- true, ab_rotated, ab_conjugate);
- eventWaitList.push_back(eventProcessB1);
- if (ErrorIn(status)) { return status; }
- }
- if (!b2_no_temp) {
- auto eventProcessB2 = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList,
- ab_one, ab_two, b_ld, b_offset, b_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
- ConstantOne<T>(), program,
- true, ab_rotated, !ab_conjugate);
- eventWaitList.push_back(eventProcessB2);
- if (ErrorIn(status)) { return status; }
- }
-
- // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
- // modify the other triangle.
- auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
- n, n, c_ld, c_offset, c_buffer,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- ConstantOne<T>(), program,
- true, c_rotated, false);
- eventWaitList.push_back(eventProcessC);
- if (ErrorIn(status)) { return status; }
-
- // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
- try {
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(n_ceiled));
- kernel.SetArgument(1, static_cast<int>(k_ceiled));
- kernel.SetArgument(2, GetRealArg(alpha));
- kernel.SetArgument(3, GetRealArg(complex_beta));
- kernel.SetArgument(4, a1_temp());
- kernel.SetArgument(5, b2_temp());
- kernel.SetArgument(6, c_temp());
-
- // Computes the global and local thread sizes
- auto global = std::vector<size_t>{
- (n_ceiled * db_["MDIMC"]) / db_["MWG"],
- (n_ceiled * db_["NDIMC"]) / db_["NWG"]
- };
- auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
- // Launches the kernel
- auto eventKernel1 = Event();
- status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventKernel1);
-
- // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
- auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
- auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
- kernel.SetArgument(2, GetRealArg(conjugate_alpha));
- kernel.SetArgument(3, GetRealArg(complex_one));
- kernel.SetArgument(4, b1_temp());
- kernel.SetArgument(5, a2_temp());
-
- // Runs the kernel again
- auto eventKernel2 = Event();
- status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventKernel2);
-
- // Runs the post-processing kernel
- auto upper = (triangle == Triangle::kUpper);
- auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- n, n, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
- false, c_rotated, false, upper, lower, true);
- if (ErrorIn(status)) { return status; }
-
- // Successfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // Loads the program from the database
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+ // Determines whether or not temporary matrices are needed
+ auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+ ab_rotated == false && ab_conjugate == false;
+ auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+ ab_rotated == false && ab_conjugate == true;
+ auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+ ab_rotated == false && ab_conjugate == false;
+ auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+ ab_rotated == false && ab_conjugate == true;
+
+ // Creates the temporary matrices
+ auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+
+ // Convert the arguments to complex versions
+ auto complex_beta = T{beta, static_cast<U>(0.0)};
+
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
+ // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
+ // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+ // case nothing has to be done, these kernels can be skipped.
+ if (!a1_no_temp) {
+ auto eventProcessA1 = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList,
+ ab_one, ab_two, a_ld, a_offset, a_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
+ ConstantOne<T>(), program,
+ true, ab_rotated, ab_conjugate);
+ eventWaitList.push_back(eventProcessA1);
+ }
+ if (!a2_no_temp) {
+ auto eventProcessA2 = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList,
+ ab_one, ab_two, a_ld, a_offset, a_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
+ ConstantOne<T>(), program,
+ true, ab_rotated, !ab_conjugate);
+ eventWaitList.push_back(eventProcessA2);
+ }
+ if (!b1_no_temp) {
+ auto eventProcessB1 = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList,
+ ab_one, ab_two, b_ld, b_offset, b_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
+ ConstantOne<T>(), program,
+ true, ab_rotated, ab_conjugate);
+ eventWaitList.push_back(eventProcessB1);
+ }
+ if (!b2_no_temp) {
+ auto eventProcessB2 = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList,
+ ab_one, ab_two, b_ld, b_offset, b_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
+ ConstantOne<T>(), program,
+ true, ab_rotated, !ab_conjugate);
+ eventWaitList.push_back(eventProcessB2);
+ }
+
+ // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+ // modify the other triangle.
+ auto eventProcessC = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+ n, n, c_ld, c_offset, c_buffer,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
+ eventWaitList.push_back(eventProcessC);
+
+ // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n_ceiled));
+ kernel.SetArgument(1, static_cast<int>(k_ceiled));
+ kernel.SetArgument(2, GetRealArg(alpha));
+ kernel.SetArgument(3, GetRealArg(complex_beta));
+ kernel.SetArgument(4, a1_temp());
+ kernel.SetArgument(5, b2_temp());
+ kernel.SetArgument(6, c_temp());
+
+ // Computes the global and local thread sizes
+ auto global = std::vector<size_t>{
+ (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+ (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+ };
+ auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+ // Launches the kernel
+ auto eventKernel1 = Event();
+ RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
+ eventWaitList.push_back(eventKernel1);
+
+ // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
+ auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
+ auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
+ kernel.SetArgument(2, GetRealArg(conjugate_alpha));
+ kernel.SetArgument(3, GetRealArg(complex_one));
+ kernel.SetArgument(4, b1_temp());
+ kernel.SetArgument(5, a2_temp());
+
+ // Runs the kernel again
+ auto eventKernel2 = Event();
+ RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
+ eventWaitList.push_back(eventKernel2);
+
+ // Runs the post-processing kernel
+ auto upper = (triangle == Triangle::kUpper);
+ auto lower = (triangle == Triangle::kLower);
+ PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ n, n, c_ld, c_offset, c_buffer,
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, true);
}
// =================================================================================================
diff --git a/src/routines/level3/xher2k.hpp b/src/routines/level3/xher2k.hpp
index 23996219..acc346e4 100644
--- a/src/routines/level3/xher2k.hpp
+++ b/src/routines/level3/xher2k.hpp
@@ -30,13 +30,13 @@ class Xher2k: public Routine {
Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K");
// Templated-precision implementation of the routine
- StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const U beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+ void DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const U beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp
index 77422526..ae8e9324 100644
--- a/src/routines/level3/xherk.cpp
+++ b/src/routines/level3/xherk.cpp
@@ -22,8 +22,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@@ -32,14 +31,14 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T, typename U>
-StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const U alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@@ -47,7 +46,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+ if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
// to matrix A (argument: conjugate transpose)
@@ -70,10 +69,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
- auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+ TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
@@ -82,106 +79,92 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
- // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
- try {
-
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
- // Determines whether or not temporary matrices are needed
- auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
- a_rotated == false && a_conjugate == false;
- auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
- a_rotated == false && b_conjugate == false;
-
- // Creates the temporary matrices
- auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
-
- // Convert the arguments to complex versions
- auto complex_alpha = T{alpha, static_cast<U>(0.0)};
- auto complex_beta = T{beta, static_cast<U>(0.0)};
-
- // Events of all kernels (including pre/post processing kernels)
- auto eventWaitList = std::vector<Event>();
- auto emptyEventList = std::vector<Event>();
-
- // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
- // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
- // case nothing has to be done, these kernels can be skipped. Two copies are created.
- if (!a_no_temp) {
- auto eventProcessA = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
- a_one, a_two, a_ld, a_offset, a_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- ConstantOne<T>(), program,
- true, a_rotated, a_conjugate);
- eventWaitList.push_back(eventProcessA);
- if (ErrorIn(status)) { return status; }
- }
- if (!b_no_temp) {
- auto eventProcessB = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
- a_one, a_two, a_ld, a_offset, a_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
- ConstantOne<T>(), program,
- true, a_rotated, b_conjugate);
- eventWaitList.push_back(eventProcessB);
- if (ErrorIn(status)) { return status; }
- }
-
- // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
- // modify the other triangle.
- auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
- n, n, c_ld, c_offset, c_buffer,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- ConstantOne<T>(), program,
- true, c_rotated, false);
- eventWaitList.push_back(eventProcessC);
- if (ErrorIn(status)) { return status; }
-
- // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
- try {
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(n_ceiled));
- kernel.SetArgument(1, static_cast<int>(k_ceiled));
- kernel.SetArgument(2, GetRealArg(complex_alpha));
- kernel.SetArgument(3, GetRealArg(complex_beta));
- kernel.SetArgument(4, a_temp());
- kernel.SetArgument(5, b_temp());
- kernel.SetArgument(6, c_temp());
-
- // Computes the global and local thread sizes
- auto global = std::vector<size_t>{
- (n_ceiled * db_["MDIMC"]) / db_["MWG"],
- (n_ceiled * db_["NDIMC"]) / db_["NWG"]
- };
- auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
- // Launches the kernel
- auto eventKernel = Event();
- status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventKernel);
-
- // Runs the post-processing kernel
- auto upper = (triangle == Triangle::kUpper);
- auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- n, n, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
- false, c_rotated, false, upper, lower, true);
- if (ErrorIn(status)) { return status; }
-
- // Successfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // Loads the program from the database
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+ // Determines whether or not temporary matrices are needed
+ auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+ a_rotated == false && a_conjugate == false;
+ auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+ a_rotated == false && b_conjugate == false;
+
+ // Creates the temporary matrices
+ auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+
+ // Convert the arguments to complex versions
+ auto complex_alpha = T{alpha, static_cast<U>(0.0)};
+ auto complex_beta = T{beta, static_cast<U>(0.0)};
+
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
+ // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
+ // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+ // case nothing has to be done, these kernels can be skipped. Two copies are created.
+ if (!a_no_temp) {
+ auto eventProcessA = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
+ ConstantOne<T>(), program,
+ true, a_rotated, a_conjugate);
+ eventWaitList.push_back(eventProcessA);
+ }
+ if (!b_no_temp) {
+ auto eventProcessB = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
+ ConstantOne<T>(), program,
+ true, a_rotated, b_conjugate);
+ eventWaitList.push_back(eventProcessB);
+ }
+
+ // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+ // modify the other triangle.
+ auto eventProcessC = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+ n, n, c_ld, c_offset, c_buffer,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
+ eventWaitList.push_back(eventProcessC);
+
+ // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n_ceiled));
+ kernel.SetArgument(1, static_cast<int>(k_ceiled));
+ kernel.SetArgument(2, GetRealArg(complex_alpha));
+ kernel.SetArgument(3, GetRealArg(complex_beta));
+ kernel.SetArgument(4, a_temp());
+ kernel.SetArgument(5, b_temp());
+ kernel.SetArgument(6, c_temp());
+
+ // Computes the global and local thread sizes
+ auto global = std::vector<size_t>{
+ (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+ (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+ };
+ auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+ // Launches the kernel
+ auto eventKernel = Event();
+ RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
+ eventWaitList.push_back(eventKernel);
+
+ // Runs the post-processing kernel
+ auto upper = (triangle == Triangle::kUpper);
+ auto lower = (triangle == Triangle::kLower);
+ PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ n, n, c_ld, c_offset, c_buffer,
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, true);
}
// =================================================================================================
diff --git a/src/routines/level3/xherk.hpp b/src/routines/level3/xherk.hpp
index 3f156a1b..51f29d7e 100644
--- a/src/routines/level3/xherk.hpp
+++ b/src/routines/level3/xherk.hpp
@@ -30,12 +30,12 @@ class Xherk: public Routine {
Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK");
// Templated-precision implementation of the routine
- StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const U alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const U beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+ void DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+ const size_t n, const size_t k,
+ const U alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const U beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
diff --git a/src/routines/level3/xsymm.cpp b/src/routines/level3/xsymm.cpp
index 04e4b718..d7f771d1 100644
--- a/src/routines/level3/xsymm.cpp
+++ b/src/routines/level3/xsymm.cpp
@@ -29,7 +29,7 @@ Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
+void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@@ -38,15 +38,14 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
+ if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
// left) or B (on the right) in the Xgemm routine.
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the squared A matrix
- auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(k, k, a_buffer, a_offset, a_ld);
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
@@ -55,73 +54,68 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
// Temporary buffer for a copy of the symmetric matrix
- try {
- auto temp_symm = Buffer<T>(context_, k*k);
-
- // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
- // routine afterwards
+ auto temp_symm = Buffer<T>(context_, k*k);
+
+ // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
+ // routine afterwards
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the arguments for the symmetric-to-squared kernel
+ kernel.SetArgument(0, static_cast<int>(k));
+ kernel.SetArgument(1, static_cast<int>(a_ld));
+ kernel.SetArgument(2, static_cast<int>(a_offset));
+ kernel.SetArgument(3, a_buffer());
+ kernel.SetArgument(4, static_cast<int>(k));
+ kernel.SetArgument(5, static_cast<int>(k));
+ kernel.SetArgument(6, static_cast<int>(0));
+ kernel.SetArgument(7, temp_symm());
+
+ // Uses the common padding kernel's thread configuration. This is allowed, since the
+ // symmetric-to-squared kernel uses the same parameters.
+ auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+ Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+ auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
+
+ // Synchronize now: 'DoGemm' does not accept a list of events to wait for
+ kernelEvent.WaitForCompletion();
+
+ // Runs the regular Xgemm code with either "C := AB+C" or ...
+ if (side == Side::kLeft) {
+ DoGemm(layout, Transpose::kNo, Transpose::kNo,
+ m, n, k,
+ alpha,
+ temp_symm, 0, k,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld);
+ }
+
+ // ... with "C := BA+C". Note that A and B are now reversed.
+ else {
try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the arguments for the symmetric-to-squared kernel
- kernel.SetArgument(0, static_cast<int>(k));
- kernel.SetArgument(1, static_cast<int>(a_ld));
- kernel.SetArgument(2, static_cast<int>(a_offset));
- kernel.SetArgument(3, a_buffer());
- kernel.SetArgument(4, static_cast<int>(k));
- kernel.SetArgument(5, static_cast<int>(k));
- kernel.SetArgument(6, static_cast<int>(0));
- kernel.SetArgument(7, temp_symm());
-
- // Uses the common padding kernel's thread configuration. This is allowed, since the
- // symmetric-to-squared kernel uses the same parameters.
- auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
- Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
- auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
-
- // Synchronize now: 'DoGemm' does not accept a list of events to wait for
- kernelEvent.WaitForCompletion();
-
- // Runs the regular Xgemm code with either "C := AB+C" or ...
- if (side == Side::kLeft) {
- status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
- m, n, k,
- alpha,
- temp_symm, 0, k,
- b_buffer, b_offset, b_ld,
- beta,
- c_buffer, c_offset, c_ld);
- }
-
- // ... with "C := BA+C". Note that A and B are now reversed.
- else {
- status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
- m, n, k,
- alpha,
- b_buffer, b_offset, b_ld,
- temp_symm, 0, k,
- beta,
- c_buffer, c_offset, c_ld);
-
- // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
- switch(status) {
- case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
- case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
- case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
- case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
- case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
- case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
- }
+ DoGemm(layout, Transpose::kNo, Transpose::kNo,
+ m, n, k,
+ alpha,
+ b_buffer, b_offset, b_ld,
+ temp_symm, 0, k,
+ beta,
+ c_buffer, c_offset, c_ld);
+ } catch (BLASError &e) {
+ // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+ switch(e.status()) {
+ case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details());
+ case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details());
+ case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details());
+ case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details());
+ case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details());
+ case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details());
+ default: throw;
}
-
- // Return the status of the Xgemm routine
- return status;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ }
+ }
}
// =================================================================================================
diff --git a/src/routines/level3/xsymm.hpp b/src/routines/level3/xsymm.hpp
index 428f78ef..ee965364 100644
--- a/src/routines/level3/xsymm.hpp
+++ b/src/routines/level3/xsymm.hpp
@@ -39,13 +39,13 @@ class Xsymm: public Xgemm<T> {
Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM");
// Templated-precision implementation of the routine
- StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+ void DoSymm(const Layout layout, const Side side, const Triangle triangle,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp
index badf3100..cb0e0461 100644
--- a/src/routines/level3/xsyr2k.cpp
+++ b/src/routines/level3/xsyr2k.cpp
@@ -22,8 +22,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@@ -32,14 +31,14 @@ Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@@ -48,7 +47,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+ if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
@@ -67,12 +66,9 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
- auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
+ TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
+ TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
@@ -81,114 +77,99 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
- // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
- try {
-
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
- // Determines whether or not temporary matrices are needed
- auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
- ab_rotated == false;
- auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
- ab_rotated == false;
-
- // Creates the temporary matrices
- auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
-
- // Events of all kernels (including pre/post processing kernels)
- auto eventWaitList = std::vector<Event>();
- auto emptyEventList = std::vector<Event>();
-
- // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
- // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
- // case nothing has to be done, these kernels can be skipped.
- if (!a_no_temp) {
- auto eventProcessA = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
- ab_one, ab_two, a_ld, a_offset, a_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- ConstantOne<T>(), program,
- true, ab_rotated, false);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessA);
- }
- if (!b_no_temp) {
- auto eventProcessB = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
- ab_one, ab_two, b_ld, b_offset, b_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
- ConstantOne<T>(), program,
- true, ab_rotated, false);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessB);
- }
-
- // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
- // modify the other triangle.
- auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
- n, n, c_ld, c_offset, c_buffer,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- ConstantOne<T>(), program,
- true, c_rotated, false);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessC);
-
- // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
- try {
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(n_ceiled));
- kernel.SetArgument(1, static_cast<int>(k_ceiled));
- kernel.SetArgument(2, GetRealArg(alpha));
- kernel.SetArgument(3, GetRealArg(beta));
- kernel.SetArgument(4, a_temp());
- kernel.SetArgument(5, b_temp());
- kernel.SetArgument(6, c_temp());
-
- // Computes the global and local thread sizes
- auto global = std::vector<size_t>{
- (n_ceiled * db_["MDIMC"]) / db_["MWG"],
- (n_ceiled * db_["NDIMC"]) / db_["NWG"]
- };
- auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
- // Launches the kernel
- auto eventKernel1 = Event();
- status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventKernel1);
-
- // Swaps the arguments for matrices A and B, and sets 'beta' to 1
- auto one = static_cast<T>(1);
- kernel.SetArgument(3, GetRealArg(one));
- kernel.SetArgument(4, b_temp());
- kernel.SetArgument(5, a_temp());
-
- // Runs the kernel again
- auto eventKernel2 = Event();
- status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventKernel2);
-
- // Runs the post-processing kernel
- auto upper = (triangle == Triangle::kUpper);
- auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- n, n, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
- false, c_rotated, false, upper, lower, false);
- if (ErrorIn(status)) { return status; }
-
- // Successfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // Loads the program from the database
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+ // Determines whether or not temporary matrices are needed
+ auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+ ab_rotated == false;
+ auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+ ab_rotated == false;
+
+ // Creates the temporary matrices
+ auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
+ // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
+ // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+ // case nothing has to be done, these kernels can be skipped.
+ if (!a_no_temp) {
+ auto eventProcessA = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
+ ab_one, ab_two, a_ld, a_offset, a_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
+ ConstantOne<T>(), program,
+ true, ab_rotated, false);
+ eventWaitList.push_back(eventProcessA);
+ }
+ if (!b_no_temp) {
+ auto eventProcessB = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
+ ab_one, ab_two, b_ld, b_offset, b_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
+ ConstantOne<T>(), program,
+ true, ab_rotated, false);
+ eventWaitList.push_back(eventProcessB);
+ }
+
+ // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+ // modify the other triangle.
+ auto eventProcessC = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+ n, n, c_ld, c_offset, c_buffer,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
+ eventWaitList.push_back(eventProcessC);
+
+ // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n_ceiled));
+ kernel.SetArgument(1, static_cast<int>(k_ceiled));
+ kernel.SetArgument(2, GetRealArg(alpha));
+ kernel.SetArgument(3, GetRealArg(beta));
+ kernel.SetArgument(4, a_temp());
+ kernel.SetArgument(5, b_temp());
+ kernel.SetArgument(6, c_temp());
+
+ // Computes the global and local thread sizes
+ auto global = std::vector<size_t>{
+ (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+ (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+ };
+ auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+ // Launches the kernel
+ auto eventKernel1 = Event();
+ RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
+ eventWaitList.push_back(eventKernel1);
+
+ // Swaps the arguments for matrices A and B, and sets 'beta' to 1
+ auto one = static_cast<T>(1);
+ kernel.SetArgument(3, GetRealArg(one));
+ kernel.SetArgument(4, b_temp());
+ kernel.SetArgument(5, a_temp());
+
+ // Runs the kernel again
+ auto eventKernel2 = Event();
+ RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
+ eventWaitList.push_back(eventKernel2);
+
+ // Runs the post-processing kernel
+ auto upper = (triangle == Triangle::kUpper);
+ auto lower = (triangle == Triangle::kLower);
+ PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ n, n, c_ld, c_offset, c_buffer,
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, false);
}
// =================================================================================================
diff --git a/src/routines/level3/xsyr2k.hpp b/src/routines/level3/xsyr2k.hpp
index 56185653..a02c6e16 100644
--- a/src/routines/level3/xsyr2k.hpp
+++ b/src/routines/level3/xsyr2k.hpp
@@ -30,13 +30,13 @@ class Xsyr2k: public Routine {
Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");
// Templated-precision implementation of the routine
- StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+ void DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp
index 438aa218..bd6c4b25 100644
--- a/src/routines/level3/xsyrk.cpp
+++ b/src/routines/level3/xsyrk.cpp
@@ -22,8 +22,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
@@ -32,14 +31,14 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@@ -47,7 +46,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
- if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+ if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
@@ -65,10 +64,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
- auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+ TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
@@ -77,90 +74,76 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
- // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
- try {
-
- // Loads the program from the database
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-
- // Determines whether or not temporary matrices are needed
- auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
- a_rotated == false;
-
- // Creates the temporary matrices
- auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
- auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
-
- // Events of all kernels (including pre/post processing kernels)
- auto eventWaitList = std::vector<Event>();
- auto emptyEventList = std::vector<Event>();
-
- // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
- // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
- // case nothing has to be done, these kernels can be skipped.
- if (!a_no_temp) {
- auto eventProcessA = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
- a_one, a_two, a_ld, a_offset, a_buffer,
- n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- ConstantOne<T>(), program,
- true, a_rotated, false);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessA);
- }
-
- // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
- // modify the other triangle.
- auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
- n, n, c_ld, c_offset, c_buffer,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- ConstantOne<T>(), program,
- true, c_rotated, false);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventProcessC);
-
- // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
- try {
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- kernel.SetArgument(0, static_cast<int>(n_ceiled));
- kernel.SetArgument(1, static_cast<int>(k_ceiled));
- kernel.SetArgument(2, GetRealArg(alpha));
- kernel.SetArgument(3, GetRealArg(beta));
- kernel.SetArgument(4, a_temp());
- kernel.SetArgument(5, a_temp());
- kernel.SetArgument(6, c_temp());
-
- // Computes the global and local thread sizes
- auto global = std::vector<size_t>{
- (n_ceiled * db_["MDIMC"]) / db_["MWG"],
- (n_ceiled * db_["NDIMC"]) / db_["NWG"]
- };
- auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
- // Launches the kernel
- auto eventKernel = Event();
- status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
- if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventKernel);
-
- // Runs the post-processing kernel
- auto upper = (triangle == Triangle::kUpper);
- auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
- n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- n, n, c_ld, c_offset, c_buffer,
- ConstantOne<T>(), program,
- false, c_rotated, false, upper, lower, false);
- if (ErrorIn(status)) { return status; }
-
-
- // Successfully finished the computation
- return StatusCode::kSuccess;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // Loads the program from the database
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+ // Determines whether or not temporary matrices are needed
+ auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+ a_rotated == false;
+
+ // Creates the temporary matrices
+ auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
+ // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
+ // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+ // case nothing has to be done, these kernels can be skipped.
+ if (!a_no_temp) {
+ auto eventProcessA = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
+ ConstantOne<T>(), program,
+ true, a_rotated, false);
+ eventWaitList.push_back(eventProcessA);
+ }
+
+ // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+ // modify the other triangle.
+ auto eventProcessC = Event();
+ PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+ n, n, c_ld, c_offset, c_buffer,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
+ eventWaitList.push_back(eventProcessC);
+
+ // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n_ceiled));
+ kernel.SetArgument(1, static_cast<int>(k_ceiled));
+ kernel.SetArgument(2, GetRealArg(alpha));
+ kernel.SetArgument(3, GetRealArg(beta));
+ kernel.SetArgument(4, a_temp());
+ kernel.SetArgument(5, a_temp());
+ kernel.SetArgument(6, c_temp());
+
+ // Computes the global and local thread sizes
+ auto global = std::vector<size_t>{
+ (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+ (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+ };
+ auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+ // Launches the kernel
+ auto eventKernel = Event();
+ RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
+ eventWaitList.push_back(eventKernel);
+
+ // Runs the post-processing kernel
+ auto upper = (triangle == Triangle::kUpper);
+ auto lower = (triangle == Triangle::kLower);
+ PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ n, n, c_ld, c_offset, c_buffer,
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, false);
}
// =================================================================================================
diff --git a/src/routines/level3/xsyrk.hpp b/src/routines/level3/xsyrk.hpp
index 7c075c26..de42b824 100644
--- a/src/routines/level3/xsyrk.hpp
+++ b/src/routines/level3/xsyrk.hpp
@@ -32,12 +32,12 @@ class Xsyrk: public Routine {
Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK");
// Templated-precision implementation of the routine
- StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+ void DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
diff --git a/src/routines/level3/xtrmm.cpp b/src/routines/level3/xtrmm.cpp
index 74a82822..6bf77cfa 100644
--- a/src/routines/level3/xtrmm.cpp
+++ b/src/routines/level3/xtrmm.cpp
@@ -29,7 +29,7 @@ Xtrmm<T>::Xtrmm(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
-StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
+void Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
@@ -37,15 +37,14 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
// Makes sure all dimensions are larger than zero
- if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
+ if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes the k dimension. This is based on whether or not matrix is A (on the left)
// or B (on the right) in the Xgemm routine.
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the triangular A matrix
- auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(k, k, a_buffer, a_offset, a_ld);
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
@@ -57,74 +56,69 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false;
// Temporary buffer for a copy of the triangular matrix
- try {
- auto temp_triangular = Buffer<T>(context_, k*k);
-
- // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
- // routine afterwards
+ auto temp_triangular = Buffer<T>(context_, k*k);
+
+ // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
+ // routine afterwards
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the arguments for the triangular-to-squared kernel
+ kernel.SetArgument(0, static_cast<int>(k));
+ kernel.SetArgument(1, static_cast<int>(a_ld));
+ kernel.SetArgument(2, static_cast<int>(a_offset));
+ kernel.SetArgument(3, a_buffer());
+ kernel.SetArgument(4, static_cast<int>(k));
+ kernel.SetArgument(5, static_cast<int>(k));
+ kernel.SetArgument(6, static_cast<int>(0));
+ kernel.SetArgument(7, temp_triangular());
+ kernel.SetArgument(8, static_cast<int>(unit_diagonal));
+
+ // Uses the common padding kernel's thread configuration. This is allowed, since the
+ // triangular-to-squared kernel uses the same parameters.
+ auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+ Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+ auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+ auto kernelEvent = Event();
+ RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
+
+ // Synchronize now: 'DoGemm' does not accept a list of events to wait for
+ kernelEvent.WaitForCompletion();
+
+ // Runs the regular Xgemm code with either "B := alpha*A*B" or ...
+ if (side == Side::kLeft) {
+ DoGemm(layout, a_transpose, Transpose::kNo,
+ m, n, k,
+ alpha,
+ temp_triangular, 0, k,
+ b_buffer, b_offset, b_ld,
+ static_cast<T>(0.0),
+ b_buffer, b_offset, b_ld);
+ }
+
+ // ... with "B := alpha*B*A". Note that A and B are now reversed.
+ else {
try {
- const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the arguments for the triangular-to-squared kernel
- kernel.SetArgument(0, static_cast<int>(k));
- kernel.SetArgument(1, static_cast<int>(a_ld));
- kernel.SetArgument(2, static_cast<int>(a_offset));
- kernel.SetArgument(3, a_buffer());
- kernel.SetArgument(4, static_cast<int>(k));
- kernel.SetArgument(5, static_cast<int>(k));
- kernel.SetArgument(6, static_cast<int>(0));
- kernel.SetArgument(7, temp_triangular());
- kernel.SetArgument(8, static_cast<int>(unit_diagonal));
-
- // Uses the common padding kernel's thread configuration. This is allowed, since the
- // triangular-to-squared kernel uses the same parameters.
- auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
- Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
- auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- auto kernelEvent = Event();
- status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
- if (ErrorIn(status)) { return status; }
-
- // Synchronize now: 'DoGemm' does not accept a list of events to wait for
- kernelEvent.WaitForCompletion();
-
- // Runs the regular Xgemm code with either "B := alpha*A*B" or ...
- if (side == Side::kLeft) {
- status = DoGemm(layout, a_transpose, Transpose::kNo,
- m, n, k,
- alpha,
- temp_triangular, 0, k,
- b_buffer, b_offset, b_ld,
- static_cast<T>(0.0),
- b_buffer, b_offset, b_ld);
- }
-
- // ... with "B := alpha*B*A". Note that A and B are now reversed.
- else {
- status = DoGemm(layout, Transpose::kNo, a_transpose,
- m, n, k,
- alpha,
- b_buffer, b_offset, b_ld,
- temp_triangular, 0, k,
- static_cast<T>(0.0),
- b_buffer, b_offset, b_ld);
-
- // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
- switch(status) {
- case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
- case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
- case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
- case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
- case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
- case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
- }
+ DoGemm(layout, Transpose::kNo, a_transpose,
+ m, n, k,
+ alpha,
+ b_buffer, b_offset, b_ld,
+ temp_triangular, 0, k,
+ static_cast<T>(0.0),
+ b_buffer, b_offset, b_ld);
+ } catch (BLASError &e) {
+ // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+ switch(e.status()) {
+ case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details());
+ case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details());
+ case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details());
+ case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details());
+ case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details());
+ case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details());
+ default: throw;
}
-
- // Return the status of the Xgemm routine
- return status;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ }
+ }
}
// =================================================================================================
diff --git a/src/routines/level3/xtrmm.hpp b/src/routines/level3/xtrmm.hpp
index 186a120e..967bf132 100644
--- a/src/routines/level3/xtrmm.hpp
+++ b/src/routines/level3/xtrmm.hpp
@@ -38,12 +38,12 @@ class Xtrmm: public Xgemm<T> {
Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM");
// Templated-precision implementation of the routine
- StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+ void DoTrmm(const Layout layout, const Side side, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
};
// =================================================================================================
diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp
index af9080af..875ca7d2 100644
--- a/src/routines/levelx/xomatcopy.cpp
+++ b/src/routines/levelx/xomatcopy.cpp
@@ -22,27 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xomatcopy<T>::Xomatcopy(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue<T>()) {
- source_string_ =
+ Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
#include "../../kernels/level3/copy_pad.opencl"
#include "../../kernels/level3/transpose_fast.opencl"
#include "../../kernels/level3/transpose_pad.opencl"
- ;
+ }) {
}
// =================================================================================================
// The main routine
template <typename T>
-StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
+void Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
// Makes sure all dimensions are larger than zero
- if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
+ if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
// Determines whether to transpose the matrix A
const auto transpose = (a_transpose != Transpose::kNo);
@@ -63,22 +62,17 @@ StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_trans
// Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than M when not-rotated
// matrix B cannot be less than M when rotated, or less than N when not-rotated
- auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
- if (ErrorIn(status)) { return status; }
- status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
- if (ErrorIn(status)) { return status; }
+ TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+ TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto emptyEventList = std::vector<Event>();
- status = PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList,
- a_one, a_two, a_ld, a_offset, a_buffer,
- b_one, b_two, b_ld, b_offset, b_buffer,
- alpha, program, false, transpose, conjugate);
- if (ErrorIn(status)) { return status; }
-
- return StatusCode::kSuccess;
+ PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
+ b_one, b_two, b_ld, b_offset, b_buffer,
+ alpha, program, false, transpose, conjugate);
}
// =================================================================================================
diff --git a/src/routines/levelx/xomatcopy.hpp b/src/routines/levelx/xomatcopy.hpp
index 0e580230..2da66693 100644
--- a/src/routines/levelx/xomatcopy.hpp
+++ b/src/routines/levelx/xomatcopy.hpp
@@ -28,10 +28,10 @@ class Xomatcopy: public Routine {
Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY");
// Templated-precision implementation of the routine
- StatusCode DoOmatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+ void DoOmatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
};
// =================================================================================================