diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-10-22 15:05:12 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-10-22 15:05:12 +0200 |
commit | 280698d0767219e174b12e51e8e42b228bbf28e9 (patch) | |
tree | 25db4d2d360cc161ca7d8e563c847faf08a745a0 | |
parent | 9b596820d2dd833648706bff505b459c58f45b4b (diff) | |
parent | 56f300607b1d0b81ab3269894fda5a066c46cdeb (diff) |
Merge pull request #117 from intelfx/exceptions
Convert to use C++ exceptions internally
105 files changed, 4462 insertions, 4223 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index ae78b5a7..17bff79b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,6 +169,7 @@ set(SOURCES src/routines/common.cpp src/cache.cpp src/clblast.cpp + src/clblast_exceptions.cpp src/clblast_c.cpp src/routine.cpp src/utilities.cpp diff --git a/include/clblast.h b/include/clblast.h index 0f52b2f9..53e23669 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -75,13 +75,14 @@ enum class StatusCode { kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small // Custom additional status codes for CLBlast - kKernelLaunchError = -2048, // Problem occurred when enqueuing the kernel - kKernelRunError = -2047, // Problem occurred while running the kernel kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small + kDatabaseError = -2041, // Entry for the device was not found in the database + kUnknownError = -2040, // A catch-all error code representing an unspecified error + kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception }; // Matrix layout and transpose types diff --git a/include/clblast_c.h b/include/clblast_c.h index 33fb4acf..2805c20f 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -76,13 +76,14 @@ typedef enum StatusCode_ { kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small // Custom additional status codes for CLBlast - kKernelLaunchError = -2048, // Problem occurred when enqueuing the kernel - kKernelRunError = -2047, // Problem occurred while running the kernel kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device kNoHalfPrecision = 
-2045, // Half precision (16-bits) not supported by the device kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small + kDatabaseError = -2041, // Entry for the device was not found in the database + kUnknownError = -2040, // A catch-all error code representing an unspecified error + kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception } StatusCode; // Matrix layout and transpose types diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index d82b13a6..04ab5475 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -30,8 +30,8 @@ from generator.routine import Routine from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU -HEADER_LINES = [96, 73, 97, 22, 29, 41] -FOOTER_LINES = [17, 75, 19, 14, 6, 6] +HEADER_LINES = [97, 73, 98, 22, 29, 41] +FOOTER_LINES = [17, 80, 19, 18, 6, 6] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." 
diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 427eb180..a0d43667 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -45,17 +45,18 @@ def clblast_h(routine): def clblast_cc(routine): """The C++ API implementation (.cpp)""" - indent1 = " " * (20 + routine.length()) + indent1 = " " * (15 + routine.length()) result = NL + "// " + routine.description + ": " + routine.short_names() + NL if routine.implemented: result += routine.routine_header_cpp(12, "") + " {" + NL - result += " auto queue_cpp = Queue(*queue);" + NL - result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL - result += " auto status = routine.SetUp();" + NL - result += " if (status != StatusCode::kSuccess) { return status; }" + NL - result += " return routine.Do" + routine.name.capitalize() + "(" + result += " try {" + NL + result += " auto queue_cpp = Queue(*queue);" + NL + result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL + result += " routine.Do" + routine.name.capitalize() + "(" result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()]) result += ");" + NL + result += " return StatusCode::kSuccess;" + NL + result += " } catch (...) 
{ return DispatchException(); }" + NL else: result += routine.routine_header_type_cpp(12) + " {" + NL result += " return StatusCode::kNotImplemented;" + NL @@ -81,12 +82,14 @@ def clblast_c_cc(routine): result = NL + "// " + routine.name.upper() + NL for flavour in routine.flavours: template = "<" + flavour.template + ">" if routine.no_scalars() else "" - indent = " " * (26 + routine.length() + len(template)) + indent = " " * (45 + routine.length() + len(template)) result += routine.routine_header_c(flavour, 20, "") + " {" + NL - result += " auto status = clblast::" + routine.name.capitalize() + template + "(" + result += " try {" + NL + result += " return static_cast<StatusCode>(clblast::" + routine.name.capitalize() + template + "(" result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)]) - result += "," + NL + indent + "queue, event);" - result += NL + " return static_cast<StatusCode>(status);" + NL + "}" + NL + result += "," + NL + indent + "queue, event));" + NL + result += " } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); }" + NL + result += "}" + NL return result diff --git a/src/buffer_test.hpp b/src/buffer_test.hpp index 80f5243f..9a23e0b7 100644 --- a/src/buffer_test.hpp +++ b/src/buffer_test.hpp @@ -22,96 +22,88 @@ namespace clblast { // Tests matrix 'A' for validity template <typename T> -StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer, +void TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer, const size_t offset, const size_t ld) { - if (ld < one) { return StatusCode::kInvalidLeadDimA; } + if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimA); } try { const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; } - } catch (...) 
{ return StatusCode::kInvalidMatrixA; } - return StatusCode::kSuccess; + if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryA); } + } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixA, e.what()); } } // Tests matrix 'B' for validity template <typename T> -StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer, +void TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer, const size_t offset, const size_t ld) { - if (ld < one) { return StatusCode::kInvalidLeadDimB; } + if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimB); } try { const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryB; } - } catch (...) { return StatusCode::kInvalidMatrixB; } - return StatusCode::kSuccess; + if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryB); } + } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixB, e.what()); } } // Tests matrix 'C' for validity template <typename T> -StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer, +void TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer, const size_t offset, const size_t ld) { - if (ld < one) { return StatusCode::kInvalidLeadDimC; } + if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimC); } try { const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryC; } - } catch (...) 
{ return StatusCode::kInvalidMatrixC; } - return StatusCode::kSuccess; + if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryC); } + } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixC, e.what()); } } // Tests matrix 'AP' for validity template <typename T> -StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) { +void TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) { try { const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; } - } catch (...) { return StatusCode::kInvalidMatrixA; } - return StatusCode::kSuccess; + if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryA); } + } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixA, e.what()); } } // ================================================================================================= // Tests vector 'X' for validity template <typename T> -StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset, +void TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset, const size_t inc) { - if (inc == 0) { return StatusCode::kInvalidIncrementX; } + if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementX); } try { const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryX; } - } catch (...) 
{ return StatusCode::kInvalidVectorX; } - return StatusCode::kSuccess; + if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryX); } + } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorX, e.what()); } } // Tests vector 'Y' for validity template <typename T> -StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset, +void TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset, const size_t inc) { - if (inc == 0) { return StatusCode::kInvalidIncrementY; } + if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementY); } try { const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryY; } - } catch (...) { return StatusCode::kInvalidVectorY; } - return StatusCode::kSuccess; + if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryY); } + } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorY, e.what()); } } // ================================================================================================= // Tests vector 'scalar' for validity template <typename T> -StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) { +void TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) { try { const auto required_size = (n + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; } - } catch (...) 
{ return StatusCode::kInvalidVectorScalar; } - return StatusCode::kSuccess; + if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryScalar); } + } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorScalar, e.what()); } } // Tests vector 'index' for validity template <typename T> -StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) { +void TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) { try { const auto required_size = (n + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; } - } catch (...) { return StatusCode::kInvalidVectorScalar; } - return StatusCode::kSuccess; + if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryScalar); } + } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorScalar, e.what()); } } // ================================================================================================= diff --git a/src/cache.cpp b/src/cache.cpp index 6080f082..6786eaa2 100644 --- a/src/cache.cpp +++ b/src/cache.cpp @@ -57,7 +57,7 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec } } binary_cache_mutex_.unlock(); - throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none."); + throw LogicError("GetBinaryFromCache: Expected binary in cache, but found none"); } // Queries the cache and retrieves a matching program. 
Assumes that the match is available, throws @@ -75,7 +75,7 @@ const Program& GetProgramFromCache(const Context &context, const Precision &prec } } program_cache_mutex_.unlock(); - throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none."); + throw LogicError("GetProgramFromCache: Expected program in cache, but found none"); } // Queries the cache to see whether or not the compiled kernel is already there @@ -109,14 +109,13 @@ bool ProgramIsInCache(const Context &context, const Precision &precision, // ================================================================================================= // Clears the cache of stored binaries and programs -StatusCode CacheClearAll() { +void CacheClearAll() { binary_cache_mutex_.lock(); binary_cache_.clear(); binary_cache_mutex_.unlock(); program_cache_mutex_.lock(); program_cache_.clear(); program_cache_mutex_.unlock(); - return StatusCode::kSuccess; } // ================================================================================================= diff --git a/src/cache.hpp b/src/cache.hpp index 9075da0d..f2b44edf 100644 --- a/src/cache.hpp +++ b/src/cache.hpp @@ -89,7 +89,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision, // ================================================================================================= // Clears the cache of stored binaries -StatusCode CacheClearAll(); +void CacheClearAll(); // ================================================================================================= } // namespace clblast diff --git a/src/clblast.cpp b/src/clblast.cpp index 79c30ca4..4bb4e0b3 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -168,13 +168,14 @@ StatusCode Swap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xswap<T>(queue_cpp, event); - auto 
status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSwap(n, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xswap<T>(queue_cpp, event); + routine.DoSwap(n, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Swap<float>(const size_t, cl_mem, const size_t, const size_t, @@ -203,13 +204,14 @@ StatusCode Scal(const size_t n, const T alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xscal<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoScal(n, - alpha, - Buffer<T>(x_buffer), x_offset, x_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xscal<T>(queue_cpp, event); + routine.DoScal(n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Scal<float>(const size_t, const float, @@ -238,13 +240,14 @@ StatusCode Copy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xcopy<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoCopy(n, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xcopy<T>(queue_cpp, event); + routine.DoCopy(n, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Copy<float>(const size_t, const cl_mem, const size_t, const size_t, @@ -274,14 +277,15 @@ StatusCode Axpy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xaxpy<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoAxpy(n, - alpha, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xaxpy<T>(queue_cpp, event); + routine.DoAxpy(n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Axpy<float>(const size_t, const float, @@ -316,14 +320,15 @@ StatusCode Dot(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xdot<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoDot(n, - Buffer<T>(dot_buffer), dot_offset, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xdot<T>(queue_cpp, event); + routine.DoDot(n, + Buffer<T>(dot_buffer), dot_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Dot<float>(const size_t, cl_mem, const size_t, @@ -348,14 +353,15 @@ StatusCode Dotu(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xdotu<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoDotu(n, - Buffer<T>(dot_buffer), dot_offset, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xdotu<T>(queue_cpp, event); + routine.DoDotu(n, + Buffer<T>(dot_buffer), dot_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Dotu<float2>(const size_t, cl_mem, const size_t, @@ -375,14 +381,15 @@ StatusCode Dotc(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xdotc<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoDotc(n, - Buffer<T>(dot_buffer), dot_offset, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xdotc<T>(queue_cpp, event); + routine.DoDotc(n, + Buffer<T>(dot_buffer), dot_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Dotc<float2>(const size_t, cl_mem, const size_t, @@ -401,13 +408,14 @@ StatusCode Nrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xnrm2<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoNrm2(n, - Buffer<T>(nrm2_buffer), nrm2_offset, - Buffer<T>(x_buffer), x_offset, x_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xnrm2<T>(queue_cpp, event); + routine.DoNrm2(n, + Buffer<T>(nrm2_buffer), nrm2_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Nrm2<float>(const size_t, cl_mem, const size_t, @@ -436,13 +444,14 @@ StatusCode Asum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xasum<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoAsum(n, - Buffer<T>(asum_buffer), asum_offset, - Buffer<T>(x_buffer), x_offset, x_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xasum<T>(queue_cpp, event); + routine.DoAsum(n, + Buffer<T>(asum_buffer), asum_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Asum<float>(const size_t, cl_mem, const size_t, @@ -471,13 +480,14 @@ StatusCode Sum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsum<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSum(n, - Buffer<T>(sum_buffer), sum_offset, - Buffer<T>(x_buffer), x_offset, x_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsum<T>(queue_cpp, event); + routine.DoSum(n, + Buffer<T>(sum_buffer), sum_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Sum<float>(const size_t, cl_mem, const size_t, @@ -506,13 +516,14 @@ StatusCode Amax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xamax<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoAmax(n, - Buffer<unsigned int>(imax_buffer), imax_offset, - Buffer<T>(x_buffer), x_offset, x_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xamax<T>(queue_cpp, event); + routine.DoAmax(n, + Buffer<unsigned int>(imax_buffer), imax_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Amax<float>(const size_t, cl_mem, const size_t, @@ -541,13 +552,14 @@ StatusCode Max(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xmax<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoMax(n, - Buffer<unsigned int>(imax_buffer), imax_offset, - Buffer<T>(x_buffer), x_offset, x_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xmax<T>(queue_cpp, event); + routine.DoMax(n, + Buffer<unsigned int>(imax_buffer), imax_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Max<float>(const size_t, cl_mem, const size_t, @@ -576,13 +588,14 @@ StatusCode Min(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xmin<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoMin(n, - Buffer<unsigned int>(imin_buffer), imin_offset, - Buffer<T>(x_buffer), x_offset, x_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xmin<T>(queue_cpp, event); + routine.DoMin(n, + Buffer<unsigned int>(imin_buffer), imin_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Min<float>(const size_t, cl_mem, const size_t, @@ -619,17 +632,18 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xgemv<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoGemv(layout, a_transpose, - m, n, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(x_buffer), x_offset, x_inc, - beta, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgemv<T>(queue_cpp, event); + routine.DoGemv(layout, a_transpose, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Gemv<float>(const Layout, const Transpose, const size_t, const size_t, @@ -682,17 +696,18 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xgbmv<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoGbmv(layout, a_transpose, - m, n, kl, ku, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(x_buffer), x_offset, x_inc, - beta, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgbmv<T>(queue_cpp, event); + routine.DoGbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Gbmv<float>(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, @@ -745,17 +760,18 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xhemv<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHemv(layout, triangle, - n, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(x_buffer), x_offset, x_inc, - beta, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhemv<T>(queue_cpp, event); + routine.DoHemv(layout, triangle, + n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Hemv<float2>(const Layout, const Triangle, const size_t, @@ -784,17 +800,18 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xhbmv<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHbmv(layout, triangle, - n, k, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(x_buffer), x_offset, x_inc, - beta, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhbmv<T>(queue_cpp, event); + routine.DoHbmv(layout, triangle, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Hbmv<float2>(const Layout, const Triangle, const size_t, const size_t, @@ -823,17 +840,18 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xhpmv<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHpmv(layout, triangle, - n, - alpha, - Buffer<T>(ap_buffer), ap_offset, - Buffer<T>(x_buffer), x_offset, x_inc, - beta, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhpmv<T>(queue_cpp, event); + routine.DoHpmv(layout, triangle, + n, + alpha, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Hpmv<float2>(const Layout, const Triangle, const size_t, @@ -862,17 +880,18 @@ StatusCode Symv(const Layout layout, const Triangle triangle, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsymv<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSymv(layout, triangle, - n, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(x_buffer), x_offset, x_inc, - beta, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsymv<T>(queue_cpp, event); + routine.DoSymv(layout, triangle, + n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Symv<float>(const Layout, const Triangle, const size_t, @@ -909,17 +928,18 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsbmv<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSbmv(layout, triangle, - n, k, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(x_buffer), x_offset, x_inc, - beta, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsbmv<T>(queue_cpp, event); + routine.DoSbmv(layout, triangle, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Sbmv<float>(const Layout, const Triangle, const size_t, const size_t, @@ -956,17 +976,18 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xspmv<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSpmv(layout, triangle, - n, - alpha, - Buffer<T>(ap_buffer), ap_offset, - Buffer<T>(x_buffer), x_offset, x_inc, - beta, - Buffer<T>(y_buffer), y_offset, y_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xspmv<T>(queue_cpp, event); + routine.DoSpmv(layout, triangle, + n, + alpha, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Spmv<float>(const Layout, const Triangle, const size_t, @@ -1000,14 +1021,15 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xtrmv<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoTrmv(layout, triangle, a_transpose, diagonal, - n, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(x_buffer), x_offset, x_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrmv<T>(queue_cpp, event); + routine.DoTrmv(layout, triangle, a_transpose, diagonal, + n, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, @@ -1042,14 +1064,15 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xtbmv<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoTbmv(layout, triangle, a_transpose, diagonal, - n, k, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(x_buffer), x_offset, x_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtbmv<T>(queue_cpp, event); + routine.DoTbmv(layout, triangle, a_transpose, diagonal, + n, k, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, @@ -1084,14 +1107,15 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_ const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xtpmv<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoTpmv(layout, triangle, a_transpose, diagonal, - n, - Buffer<T>(ap_buffer), ap_offset, - Buffer<T>(x_buffer), x_offset, x_inc); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtpmv<T>(queue_cpp, event); + routine.DoTpmv(layout, triangle, a_transpose, diagonal, + n, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, @@ -1218,16 +1242,17 @@ StatusCode Ger(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xger<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoGer(layout, - m, n, - alpha, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(y_buffer), y_offset, y_inc, - Buffer<T>(a_buffer), a_offset, a_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xger<T>(queue_cpp, event); + routine.DoGer(layout, + m, n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Ger<float>(const Layout, const size_t, const size_t, @@ -1260,16 +1285,17 @@ StatusCode Geru(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xgeru<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoGeru(layout, - m, n, - alpha, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(y_buffer), y_offset, y_inc, - Buffer<T>(a_buffer), a_offset, a_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgeru<T>(queue_cpp, event); + routine.DoGeru(layout, + m, n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Geru<float2>(const Layout, const size_t, const size_t, @@ -1295,16 +1321,17 @@ StatusCode Gerc(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xgerc<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoGerc(layout, - m, n, - alpha, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(y_buffer), y_offset, y_inc, - Buffer<T>(a_buffer), a_offset, a_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgerc<T>(queue_cpp, event); + routine.DoGerc(layout, + m, n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Gerc<float2>(const Layout, const size_t, const size_t, @@ -1329,15 +1356,16 @@ StatusCode Her(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xher<std::complex<T>,T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHer(layout, triangle, - n, - alpha, - Buffer<std::complex<T>>(x_buffer), x_offset, x_inc, - Buffer<std::complex<T>>(a_buffer), a_offset, a_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xher<std::complex<T>,T>(queue_cpp, event); + routine.DoHer(layout, triangle, + n, + alpha, + Buffer<std::complex<T>>(x_buffer), x_offset, x_inc, + Buffer<std::complex<T>>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } 
catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Her<float>(const Layout, const Triangle, const size_t, @@ -1360,15 +1388,16 @@ StatusCode Hpr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHpr(layout, triangle, - n, - alpha, - Buffer<std::complex<T>>(x_buffer), x_offset, x_inc, - Buffer<std::complex<T>>(ap_buffer), ap_offset); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event); + routine.DoHpr(layout, triangle, + n, + alpha, + Buffer<std::complex<T>>(x_buffer), x_offset, x_inc, + Buffer<std::complex<T>>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Hpr<float>(const Layout, const Triangle, const size_t, @@ -1392,16 +1421,17 @@ StatusCode Her2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xher2<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHer2(layout, triangle, - n, - alpha, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(y_buffer), y_offset, y_inc, - Buffer<T>(a_buffer), a_offset, a_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xher2<T>(queue_cpp, event); + routine.DoHer2(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Her2<float2>(const Layout, const Triangle, const size_t, @@ -1427,16 +1457,17 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xhpr2<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHpr2(layout, triangle, - n, - alpha, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(y_buffer), y_offset, y_inc, - Buffer<T>(ap_buffer), ap_offset); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhpr2<T>(queue_cpp, event); + routine.DoHpr2(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Hpr2<float2>(const Layout, const Triangle, const size_t, @@ -1461,15 +1492,16 @@ StatusCode Syr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsyr<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSyr(layout, triangle, - n, - alpha, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(a_buffer), a_offset, a_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr<T>(queue_cpp, event); + routine.DoSyr(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Syr<float>(const Layout, const Triangle, const size_t, @@ -1498,15 +1530,16 @@ StatusCode Spr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xspr<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSpr(layout, triangle, - n, - alpha, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(ap_buffer), ap_offset); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xspr<T>(queue_cpp, event); + routine.DoSpr(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Spr<float>(const Layout, const Triangle, const size_t, @@ -1536,16 +1569,17 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsyr2<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSyr2(layout, triangle, - n, - alpha, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(y_buffer), y_offset, y_inc, - Buffer<T>(a_buffer), a_offset, a_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr2<T>(queue_cpp, event); + routine.DoSyr2(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Syr2<float>(const Layout, const Triangle, const size_t, @@ -1578,16 +1612,17 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xspr2<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSpr2(layout, triangle, - n, - alpha, - Buffer<T>(x_buffer), x_offset, x_inc, - Buffer<T>(y_buffer), y_offset, y_inc, - Buffer<T>(ap_buffer), ap_offset); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xspr2<T>(queue_cpp, event); + routine.DoSpr2(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Spr2<float>(const Layout, const Triangle, const size_t, @@ -1625,17 +1660,18 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xgemm<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoGemm(layout, a_transpose, b_transpose, - m, n, k, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(b_buffer), b_offset, b_ld, - beta, - Buffer<T>(c_buffer), c_offset, c_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xgemm<T>(queue_cpp, event); + routine.DoGemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); 
+ return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Gemm<float>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, @@ -1688,17 +1724,18 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsymm<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSymm(layout, side, triangle, - m, n, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(b_buffer), b_offset, b_ld, - beta, - Buffer<T>(c_buffer), c_offset, c_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsymm<T>(queue_cpp, event); + routine.DoSymm(layout, side, triangle, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Symm<float>(const Layout, const Side, const Triangle, const size_t, const size_t, @@ -1751,17 +1788,18 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xhemm<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHemm(layout, side, triangle, - m, n, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(b_buffer), b_offset, b_ld, - beta, - Buffer<T>(c_buffer), c_offset, c_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xhemm<T>(queue_cpp, event); + routine.DoHemm(layout, side, triangle, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Hemm<float2>(const Layout, const Side, const Triangle, const size_t, const size_t, @@ -1789,16 +1827,17 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsyrk<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSyrk(layout, triangle, a_transpose, - n, k, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - beta, - Buffer<T>(c_buffer), c_offset, c_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyrk<T>(queue_cpp, event); + routine.DoSyrk(layout, triangle, a_transpose, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Syrk<float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, @@ -1845,16 +1884,17 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xherk<std::complex<T>,T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHerk(layout, triangle, a_transpose, - n, k, - alpha, - Buffer<std::complex<T>>(a_buffer), a_offset, a_ld, - beta, - Buffer<std::complex<T>>(c_buffer), c_offset, c_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xherk<std::complex<T>,T>(queue_cpp, event); + routine.DoHerk(layout, triangle, a_transpose, + n, k, + alpha, + Buffer<std::complex<T>>(a_buffer), a_offset, a_ld, + beta, + Buffer<std::complex<T>>(c_buffer), 
c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Herk<float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, @@ -1881,17 +1921,18 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsyr2k<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSyr2k(layout, triangle, ab_transpose, - n, k, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(b_buffer), b_offset, b_ld, - beta, - Buffer<T>(c_buffer), c_offset, c_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr2k<T>(queue_cpp, event); + routine.DoSyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Syr2k<float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, @@ -1944,17 +1985,18 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a const U beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xher2k<T,U>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHer2k(layout, triangle, ab_transpose, - n, k, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(b_buffer), b_offset, b_ld, - beta, - Buffer<T>(c_buffer), c_offset, c_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xher2k<T,U>(queue_cpp, event); + routine.DoHer2k(layout, triangle, ab_transpose, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Her2k<float2,float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, @@ -1981,15 +2023,16 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xtrmm<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, - m, n, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(b_buffer), b_offset, b_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrmm<T>(queue_cpp, event); + routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Trmm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, @@ -2075,15 +2118,16 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xomatcopy<T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoOmatcopy(layout, a_transpose, - m, n, - alpha, - Buffer<T>(a_buffer), a_offset, a_ld, - Buffer<T>(b_buffer), b_offset, b_ld); + try { + auto queue_cpp = Queue(*queue); + auto routine = Xomatcopy<T>(queue_cpp, event); + routine.DoOmatcopy(layout, a_transpose, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Omatcopy<float>(const Layout, const Transpose, const size_t, const size_t, @@ -2119,7 +2163,12 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose, // ================================================================================================= // Clears the cache of stored binaries -StatusCode ClearCache() { return CacheClearAll(); } +StatusCode ClearCache() { + try { + CacheClearAll(); + } catch (...) 
{ return DispatchException(); } + return StatusCode::kSuccess; +} // Fills the cache with all binaries for a specific device // TODO: Add half-precision FP16 set-up calls @@ -2132,59 +2181,59 @@ StatusCode FillCache(const cl_device_id device) { auto queue = Queue(context, device_cpp); // Runs all the level 1 set-up functions - Xswap<float>(queue, nullptr).SetUp(); Xswap<double>(queue, nullptr).SetUp(); Xswap<float2>(queue, nullptr).SetUp(); Xswap<double2>(queue, nullptr).SetUp(); - Xswap<float>(queue, nullptr).SetUp(); Xswap<double>(queue, nullptr).SetUp(); Xswap<float2>(queue, nullptr).SetUp(); Xswap<double2>(queue, nullptr).SetUp(); - Xscal<float>(queue, nullptr).SetUp(); Xscal<double>(queue, nullptr).SetUp(); Xscal<float2>(queue, nullptr).SetUp(); Xscal<double2>(queue, nullptr).SetUp(); - Xcopy<float>(queue, nullptr).SetUp(); Xcopy<double>(queue, nullptr).SetUp(); Xcopy<float2>(queue, nullptr).SetUp(); Xcopy<double2>(queue, nullptr).SetUp(); - Xaxpy<float>(queue, nullptr).SetUp(); Xaxpy<double>(queue, nullptr).SetUp(); Xaxpy<float2>(queue, nullptr).SetUp(); Xaxpy<double2>(queue, nullptr).SetUp(); - Xdot<float>(queue, nullptr).SetUp(); Xdot<double>(queue, nullptr).SetUp(); - Xdotu<float2>(queue, nullptr).SetUp(); Xdotu<double2>(queue, nullptr).SetUp(); - Xdotc<float2>(queue, nullptr).SetUp(); Xdotc<double2>(queue, nullptr).SetUp(); - Xnrm2<float>(queue, nullptr).SetUp(); Xnrm2<double>(queue, nullptr).SetUp(); Xnrm2<float2>(queue, nullptr).SetUp(); Xnrm2<double2>(queue, nullptr).SetUp(); - Xasum<float>(queue, nullptr).SetUp(); Xasum<double>(queue, nullptr).SetUp(); Xasum<float2>(queue, nullptr).SetUp(); Xasum<double2>(queue, nullptr).SetUp(); - Xsum<float>(queue, nullptr).SetUp(); Xsum<double>(queue, nullptr).SetUp(); Xsum<float2>(queue, nullptr).SetUp(); Xsum<double2>(queue, nullptr).SetUp(); - Xamax<float>(queue, nullptr).SetUp(); Xamax<double>(queue, nullptr).SetUp(); Xamax<float2>(queue, nullptr).SetUp(); Xamax<double2>(queue, nullptr).SetUp(); - 
Xmax<float>(queue, nullptr).SetUp(); Xmax<double>(queue, nullptr).SetUp(); Xmax<float2>(queue, nullptr).SetUp(); Xmax<double2>(queue, nullptr).SetUp(); - Xmin<float>(queue, nullptr).SetUp(); Xmin<double>(queue, nullptr).SetUp(); Xmin<float2>(queue, nullptr).SetUp(); Xmin<double2>(queue, nullptr).SetUp(); + Xswap<float>(queue, nullptr); Xswap<double>(queue, nullptr); Xswap<float2>(queue, nullptr); Xswap<double2>(queue, nullptr); + Xswap<float>(queue, nullptr); Xswap<double>(queue, nullptr); Xswap<float2>(queue, nullptr); Xswap<double2>(queue, nullptr); + Xscal<float>(queue, nullptr); Xscal<double>(queue, nullptr); Xscal<float2>(queue, nullptr); Xscal<double2>(queue, nullptr); + Xcopy<float>(queue, nullptr); Xcopy<double>(queue, nullptr); Xcopy<float2>(queue, nullptr); Xcopy<double2>(queue, nullptr); + Xaxpy<float>(queue, nullptr); Xaxpy<double>(queue, nullptr); Xaxpy<float2>(queue, nullptr); Xaxpy<double2>(queue, nullptr); + Xdot<float>(queue, nullptr); Xdot<double>(queue, nullptr); + Xdotu<float2>(queue, nullptr); Xdotu<double2>(queue, nullptr); + Xdotc<float2>(queue, nullptr); Xdotc<double2>(queue, nullptr); + Xnrm2<float>(queue, nullptr); Xnrm2<double>(queue, nullptr); Xnrm2<float2>(queue, nullptr); Xnrm2<double2>(queue, nullptr); + Xasum<float>(queue, nullptr); Xasum<double>(queue, nullptr); Xasum<float2>(queue, nullptr); Xasum<double2>(queue, nullptr); + Xsum<float>(queue, nullptr); Xsum<double>(queue, nullptr); Xsum<float2>(queue, nullptr); Xsum<double2>(queue, nullptr); + Xamax<float>(queue, nullptr); Xamax<double>(queue, nullptr); Xamax<float2>(queue, nullptr); Xamax<double2>(queue, nullptr); + Xmax<float>(queue, nullptr); Xmax<double>(queue, nullptr); Xmax<float2>(queue, nullptr); Xmax<double2>(queue, nullptr); + Xmin<float>(queue, nullptr); Xmin<double>(queue, nullptr); Xmin<float2>(queue, nullptr); Xmin<double2>(queue, nullptr); // Runs all the level 2 set-up functions - Xgemv<float>(queue, nullptr).SetUp(); Xgemv<double>(queue, nullptr).SetUp(); 
Xgemv<float2>(queue, nullptr).SetUp(); Xgemv<double2>(queue, nullptr).SetUp(); - Xgbmv<float>(queue, nullptr).SetUp(); Xgbmv<double>(queue, nullptr).SetUp(); Xgbmv<float2>(queue, nullptr).SetUp(); Xgbmv<double2>(queue, nullptr).SetUp(); - Xhemv<float2>(queue, nullptr).SetUp(); Xhemv<double2>(queue, nullptr).SetUp(); - Xhbmv<float2>(queue, nullptr).SetUp(); Xhbmv<double2>(queue, nullptr).SetUp(); - Xhpmv<float2>(queue, nullptr).SetUp(); Xhpmv<double2>(queue, nullptr).SetUp(); - Xsymv<float>(queue, nullptr).SetUp(); Xsymv<double>(queue, nullptr).SetUp(); - Xsbmv<float>(queue, nullptr).SetUp(); Xsbmv<double>(queue, nullptr).SetUp(); - Xspmv<float>(queue, nullptr).SetUp(); Xspmv<double>(queue, nullptr).SetUp(); - Xtrmv<float>(queue, nullptr).SetUp(); Xtrmv<double>(queue, nullptr).SetUp(); Xtrmv<float2>(queue, nullptr).SetUp(); Xtrmv<double2>(queue, nullptr).SetUp(); - Xtbmv<float>(queue, nullptr).SetUp(); Xtbmv<double>(queue, nullptr).SetUp(); Xtbmv<float2>(queue, nullptr).SetUp(); Xtbmv<double2>(queue, nullptr).SetUp(); - Xtpmv<float>(queue, nullptr).SetUp(); Xtpmv<double>(queue, nullptr).SetUp(); Xtpmv<float2>(queue, nullptr).SetUp(); Xtpmv<double2>(queue, nullptr).SetUp(); - Xger<float>(queue, nullptr).SetUp(); Xger<double>(queue, nullptr).SetUp(); - Xgeru<float2>(queue, nullptr).SetUp(); Xgeru<double2>(queue, nullptr).SetUp(); - Xgerc<float2>(queue, nullptr).SetUp(); Xgerc<double2>(queue, nullptr).SetUp(); - Xher<float2,float>(queue, nullptr).SetUp(); Xher<double2,double>(queue, nullptr).SetUp(); - Xhpr<float2,float>(queue, nullptr).SetUp(); Xhpr<double2,double>(queue, nullptr).SetUp(); - Xher2<float2>(queue, nullptr).SetUp(); Xher2<double2>(queue, nullptr).SetUp(); - Xhpr2<float2>(queue, nullptr).SetUp(); Xhpr2<double2>(queue, nullptr).SetUp(); - Xsyr<float>(queue, nullptr).SetUp(); Xsyr<double>(queue, nullptr).SetUp(); - Xspr<float>(queue, nullptr).SetUp(); Xspr<double>(queue, nullptr).SetUp(); - Xsyr2<float>(queue, nullptr).SetUp(); Xsyr2<double>(queue, 
nullptr).SetUp(); - Xspr2<float>(queue, nullptr).SetUp(); Xspr2<double>(queue, nullptr).SetUp(); + Xgemv<float>(queue, nullptr); Xgemv<double>(queue, nullptr); Xgemv<float2>(queue, nullptr); Xgemv<double2>(queue, nullptr); + Xgbmv<float>(queue, nullptr); Xgbmv<double>(queue, nullptr); Xgbmv<float2>(queue, nullptr); Xgbmv<double2>(queue, nullptr); + Xhemv<float2>(queue, nullptr); Xhemv<double2>(queue, nullptr); + Xhbmv<float2>(queue, nullptr); Xhbmv<double2>(queue, nullptr); + Xhpmv<float2>(queue, nullptr); Xhpmv<double2>(queue, nullptr); + Xsymv<float>(queue, nullptr); Xsymv<double>(queue, nullptr); + Xsbmv<float>(queue, nullptr); Xsbmv<double>(queue, nullptr); + Xspmv<float>(queue, nullptr); Xspmv<double>(queue, nullptr); + Xtrmv<float>(queue, nullptr); Xtrmv<double>(queue, nullptr); Xtrmv<float2>(queue, nullptr); Xtrmv<double2>(queue, nullptr); + Xtbmv<float>(queue, nullptr); Xtbmv<double>(queue, nullptr); Xtbmv<float2>(queue, nullptr); Xtbmv<double2>(queue, nullptr); + Xtpmv<float>(queue, nullptr); Xtpmv<double>(queue, nullptr); Xtpmv<float2>(queue, nullptr); Xtpmv<double2>(queue, nullptr); + Xger<float>(queue, nullptr); Xger<double>(queue, nullptr); + Xgeru<float2>(queue, nullptr); Xgeru<double2>(queue, nullptr); + Xgerc<float2>(queue, nullptr); Xgerc<double2>(queue, nullptr); + Xher<float2,float>(queue, nullptr); Xher<double2,double>(queue, nullptr); + Xhpr<float2,float>(queue, nullptr); Xhpr<double2,double>(queue, nullptr); + Xher2<float2>(queue, nullptr); Xher2<double2>(queue, nullptr); + Xhpr2<float2>(queue, nullptr); Xhpr2<double2>(queue, nullptr); + Xsyr<float>(queue, nullptr); Xsyr<double>(queue, nullptr); + Xspr<float>(queue, nullptr); Xspr<double>(queue, nullptr); + Xsyr2<float>(queue, nullptr); Xsyr2<double>(queue, nullptr); + Xspr2<float>(queue, nullptr); Xspr2<double>(queue, nullptr); // Runs all the level 3 set-up functions - Xgemm<float>(queue, nullptr).SetUp(); Xgemm<double>(queue, nullptr).SetUp(); Xgemm<float2>(queue, nullptr).SetUp(); 
Xgemm<double2>(queue, nullptr).SetUp(); - Xsymm<float>(queue, nullptr).SetUp(); Xsymm<double>(queue, nullptr).SetUp(); Xsymm<float2>(queue, nullptr).SetUp(); Xsymm<double2>(queue, nullptr).SetUp(); - Xhemm<float2>(queue, nullptr).SetUp(); Xhemm<double2>(queue, nullptr).SetUp(); - Xsyrk<float>(queue, nullptr).SetUp(); Xsyrk<double>(queue, nullptr).SetUp(); Xsyrk<float2>(queue, nullptr).SetUp(); Xsyrk<double2>(queue, nullptr).SetUp(); - Xherk<float2,float>(queue, nullptr).SetUp(); Xherk<double2,double>(queue, nullptr).SetUp(); - Xsyr2k<float>(queue, nullptr).SetUp(); Xsyr2k<double>(queue, nullptr).SetUp(); Xsyr2k<float2>(queue, nullptr).SetUp(); Xsyr2k<double2>(queue, nullptr).SetUp(); - Xher2k<float2,float>(queue, nullptr).SetUp(); Xher2k<double2,double>(queue, nullptr).SetUp(); - Xtrmm<float>(queue, nullptr).SetUp(); Xtrmm<double>(queue, nullptr).SetUp(); Xtrmm<float2>(queue, nullptr).SetUp(); Xtrmm<double2>(queue, nullptr).SetUp(); + Xgemm<float>(queue, nullptr); Xgemm<double>(queue, nullptr); Xgemm<float2>(queue, nullptr); Xgemm<double2>(queue, nullptr); + Xsymm<float>(queue, nullptr); Xsymm<double>(queue, nullptr); Xsymm<float2>(queue, nullptr); Xsymm<double2>(queue, nullptr); + Xhemm<float2>(queue, nullptr); Xhemm<double2>(queue, nullptr); + Xsyrk<float>(queue, nullptr); Xsyrk<double>(queue, nullptr); Xsyrk<float2>(queue, nullptr); Xsyrk<double2>(queue, nullptr); + Xherk<float2,float>(queue, nullptr); Xherk<double2,double>(queue, nullptr); + Xsyr2k<float>(queue, nullptr); Xsyr2k<double>(queue, nullptr); Xsyr2k<float2>(queue, nullptr); Xsyr2k<double2>(queue, nullptr); + Xher2k<float2,float>(queue, nullptr); Xher2k<double2,double>(queue, nullptr); + Xtrmm<float>(queue, nullptr); Xtrmm<double>(queue, nullptr); Xtrmm<float2>(queue, nullptr); Xtrmm<double2>(queue, nullptr); // Runs all the level 3 set-up functions - Xomatcopy<float>(queue, nullptr).SetUp(); Xomatcopy<double>(queue, nullptr).SetUp(); Xomatcopy<float2>(queue, nullptr).SetUp(); 
Xomatcopy<double2>(queue, nullptr).SetUp(); + Xomatcopy<float>(queue, nullptr); Xomatcopy<double>(queue, nullptr); Xomatcopy<float2>(queue, nullptr); Xomatcopy<double2>(queue, nullptr); - } catch (...) { return StatusCode::kBuildProgramFailure; } + } catch (...) { return DispatchException(); } return StatusCode::kSuccess; } diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp index 9ea2c884..0174fd19 100644 --- a/src/clblast_c.cpp +++ b/src/clblast_c.cpp @@ -31,24 +31,26 @@ StatusCode CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset, cl_mem sc_buffer, const size_t sc_offset, cl_mem ss_buffer, const size_t ss_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rotg<float>(sa_buffer, sa_offset, - sb_buffer, sb_offset, - sc_buffer, sc_offset, - ss_buffer, ss_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Rotg<float>(sa_buffer, sa_offset, + sb_buffer, sb_offset, + sc_buffer, sc_offset, + ss_buffer, ss_offset, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, cl_mem sb_buffer, const size_t sb_offset, cl_mem sc_buffer, const size_t sc_offset, cl_mem ss_buffer, const size_t ss_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rotg<double>(sa_buffer, sa_offset, - sb_buffer, sb_offset, - sc_buffer, sc_offset, - ss_buffer, ss_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Rotg<double>(sa_buffer, sa_offset, + sb_buffer, sb_offset, + sc_buffer, sc_offset, + ss_buffer, ss_offset, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // ROTMG @@ -58,13 +60,14 @@ StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rotmg<float>(sd1_buffer, sd1_offset, - sd2_buffer, sd2_offset, - sx1_buffer, sx1_offset, - sy1_buffer, sy1_offset, - sparam_buffer, sparam_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Rotmg<float>(sd1_buffer, sd1_offset, + sd2_buffer, sd2_offset, + sx1_buffer, sx1_offset, + sy1_buffer, sy1_offset, + sparam_buffer, sparam_offset, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, @@ -72,13 +75,14 @@ StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rotmg<double>(sd1_buffer, sd1_offset, - sd2_buffer, sd2_offset, - sx1_buffer, sx1_offset, - sy1_buffer, sy1_offset, - sparam_buffer, sparam_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Rotmg<double>(sd1_buffer, sd1_offset, + sd2_buffer, sd2_offset, + sx1_buffer, sx1_offset, + sy1_buffer, sy1_offset, + sparam_buffer, sparam_offset, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // ROT @@ -88,13 +92,14 @@ StatusCode CLBlastSrot(const size_t n, const float cos, const float sin, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rot(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - cos, - sin, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Rot(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + cos, + sin, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDrot(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -102,13 +107,14 @@ StatusCode CLBlastDrot(const size_t n, const double cos, const double sin, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rot(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - cos, - sin, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Rot(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + cos, + sin, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // ROTM @@ -117,24 +123,26 @@ StatusCode CLBlastSrotm(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rotm<float>(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - sparam_buffer, sparam_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Rotm<float>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + sparam_buffer, sparam_offset, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDrotm(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rotm<double>(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - sparam_buffer, sparam_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Rotm<double>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + sparam_buffer, sparam_offset, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // SWAP @@ -142,51 +150,56 @@ StatusCode CLBlastSswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Swap<float>(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Swap<float>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Swap<double>(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Swap<double>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Swap<float2>(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Swap<float2>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Swap<double2>(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Swap<double2>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Swap<half>(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Swap<half>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // SCAL @@ -194,51 +207,56 @@ StatusCode CLBlastSscal(const size_t n, const float alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Scal(n, - alpha, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Scal(n, + alpha, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDscal(const size_t n, const double alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Scal(n, - alpha, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Scal(n, + alpha, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCscal(const size_t n, const cl_float2 alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Scal(n, - float2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Scal(n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZscal(const size_t n, const cl_double2 alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Scal(n, - double2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Scal(n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHscal(const size_t n, const cl_half alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Scal(n, - alpha, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Scal(n, + alpha, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // COPY @@ -246,51 +264,56 @@ StatusCode CLBlastScopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Copy<float>(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Copy<float>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Copy<double>(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Copy<double>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Copy<float2>(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Copy<float2>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Copy<double2>(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Copy<double2>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Copy<half>(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Copy<half>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // AXPY @@ -299,60 +322,65 @@ StatusCode CLBlastSaxpy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Axpy(n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDaxpy(const size_t n, const double alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Axpy(n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCaxpy(const size_t n, const cl_float2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - float2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Axpy(n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZaxpy(const size_t n, const cl_double2 alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - double2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Axpy(n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHaxpy(const size_t n, const cl_half alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Axpy(n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // DOT @@ -361,36 +389,39 @@ StatusCode CLBlastSdot(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dot<float>(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Dot<float>(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dot<double>(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Dot<double>(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dot<half>(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Dot<half>(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // DOTU @@ -399,24 +430,26 @@ StatusCode CLBlastCdotu(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dotu<float2>(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Dotu<float2>(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZdotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dotu<double2>(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Dotu<double2>(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // DOTC @@ -425,24 +458,26 @@ StatusCode CLBlastCdotc(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dotc<float2>(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Dotc<float2>(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZdotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dotc<double2>(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Dotc<double2>(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // NRM2 @@ -450,51 +485,56 @@ StatusCode CLBlastSnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Nrm2<float>(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Nrm2<float>(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Nrm2<double>(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Nrm2<double>(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastScnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Nrm2<float2>(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Nrm2<float2>(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDznrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Nrm2<double2>(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Nrm2<double2>(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Nrm2<half>(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Nrm2<half>(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // ASUM @@ -502,51 +542,56 @@ StatusCode CLBlastSasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Asum<float>(n, - asum_buffer, asum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Asum<float>(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Asum<double>(n, - asum_buffer, asum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Asum<double>(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastScasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Asum<float2>(n, - asum_buffer, asum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Asum<float2>(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDzasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Asum<double2>(n, - asum_buffer, asum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Asum<double2>(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Asum<half>(n, - asum_buffer, asum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Asum<half>(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // SUM @@ -554,51 +599,56 @@ StatusCode CLBlastSsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sum<float>(n, - sum_buffer, sum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Sum<float>(n, + sum_buffer, sum_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sum<double>(n, - sum_buffer, sum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Sum<double>(n, + sum_buffer, sum_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastScsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sum<float2>(n, - sum_buffer, sum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Sum<float2>(n, + sum_buffer, sum_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDzsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sum<double2>(n, - sum_buffer, sum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Sum<double2>(n, + sum_buffer, sum_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sum<half>(n, - sum_buffer, sum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Sum<half>(n, + sum_buffer, sum_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // AMAX @@ -606,51 +656,56 @@ StatusCode CLBlastiSamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Amax<float>(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Amax<float>(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastiDamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Amax<double>(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Amax<double>(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastiCamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Amax<float2>(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Amax<float2>(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastiZamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Amax<double2>(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Amax<double2>(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastiHamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Amax<half>(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Amax<half>(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // MAX @@ -658,51 +713,56 @@ StatusCode CLBlastiSmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Max<float>(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Max<float>(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastiDmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Max<double>(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Max<double>(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastiCmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Max<float2>(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Max<float2>(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastiZmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Max<double2>(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Max<double2>(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastiHmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Max<half>(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Max<half>(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // MIN @@ -710,51 +770,56 @@ StatusCode CLBlastiSmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Min<float>(n, - imin_buffer, imin_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Min<float>(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastiDmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Min<double>(n, - imin_buffer, imin_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Min<double>(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastiCmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Min<float2>(n, - imin_buffer, imin_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Min<float2>(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastiZmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Min<double2>(n, - imin_buffer, imin_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Min<double2>(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastiHmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Min<half>(n, - imin_buffer, imin_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Min<half>(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // ================================================================================================= @@ -770,16 +835,17 @@ StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gemv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDgemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -789,16 +855,17 @@ StatusCode CLBlastDgemv(const Layout layout, const Transpose a_transpose, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gemv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -808,16 +875,17 @@ StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - float2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gemv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, 
y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -827,16 +895,17 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - double2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gemv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHgemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -846,16 +915,17 @@ StatusCode CLBlastHgemv(const Layout layout, const Transpose a_transpose, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gemv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // GBMV @@ -867,16 +937,17 @@ StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, kl, ku, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, @@ -886,16 +957,17 @@ StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, kl, ku, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, @@ -905,16 +977,17 @@ StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, kl, ku, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - float2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, kl, ku, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, @@ -924,16 +997,17 @@ StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, kl, ku, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - double2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, kl, ku, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, @@ -943,16 +1017,17 @@ StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, kl, ku, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // HEMV @@ -964,16 +1039,17 @@ StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hemv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - float2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Hemv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, const size_t n, @@ -983,16 +1059,17 @@ StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hemv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - double2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Hemv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // HBMV @@ -1004,16 +1081,17 @@ StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hbmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, k, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - float2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Hbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, k, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, @@ -1023,16 +1101,17 @@ StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hbmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, k, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - double2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Hbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, k, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // HPMV @@ -1044,16 +1123,17 @@ StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, const cl_float2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hpmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - float2{alpha.s[0], alpha.s[1]}, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - float2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Hpmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, const size_t n, @@ -1063,16 +1143,17 @@ StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hpmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - double2{alpha.s[0], alpha.s[1]}, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - double2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Hpmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // SYMV @@ -1084,16 +1165,17 @@ StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Symv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, const size_t n, @@ -1103,16 +1185,17 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Symv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle, const size_t n, @@ -1122,16 +1205,17 @@ StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Symv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // SBMV @@ -1143,16 +1227,17 @@ StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sbmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, k, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Sbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, @@ -1162,16 +1247,17 @@ StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sbmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, k, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Sbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, @@ -1181,16 +1267,17 @@ StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sbmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, k, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Sbmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // SPMV @@ -1202,16 +1289,17 @@ StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, const float beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Spmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, const size_t n, @@ -1221,16 +1309,17 @@ StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Spmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle, const size_t n, @@ -1240,16 +1329,17 @@ StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle, const cl_half beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spmv(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Spmv(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // TRMV @@ -1258,75 +1348,80 @@ StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Tran const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmv<float>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trmv<float>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmv<double>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trmv<double>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmv<float2>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trmv<float2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmv<double2>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trmv<double2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmv<half>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trmv<half>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // TBMV @@ -1335,75 +1430,80 @@ StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Tran const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbmv<float>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tbmv<float>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbmv<double>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tbmv<double>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbmv<float2>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tbmv<float2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbmv<double2>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tbmv<double2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbmv<half>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tbmv<half>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // TPMV @@ -1412,75 +1512,80 @@ StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Tran const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpmv<float>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tpmv<float>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpmv<double>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tpmv<double>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event)); 
+ } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpmv<float2>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tpmv<float2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpmv<double2>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tpmv<double2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpmv<half>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tpmv<half>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // TRSV @@ -1489,60 +1594,64 @@ StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Tran const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsv<float>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trsv<float>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsv<double>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trsv<double>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsv<float2>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trsv<float2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsv<double2>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trsv<double2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // TBSV @@ -1551,60 +1660,64 @@ StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Tran const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbsv<float>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tbsv<float>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbsv<double>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tbsv<double>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbsv<float2>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tbsv<float2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbsv<double2>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tbsv<double2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // TPSV @@ -1613,60 +1726,64 @@ StatusCode CLBlastStpsv(const Layout layout, const Triangle triangle, const Tran const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpsv<float>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tpsv<float>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpsv<double>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tpsv<double>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event)); 
+ } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpsv<float2>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tpsv<float2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpsv<double2>(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Tpsv<double2>(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // GER @@ -1677,14 +1794,15 @@ StatusCode CLBlastSger(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Ger(static_cast<clblast::Layout>(layout), - m, n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Ger(static_cast<clblast::Layout>(layout), + m, n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDger(const Layout layout, const size_t m, const size_t n, @@ -1693,14 +1811,15 @@ StatusCode CLBlastDger(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Ger(static_cast<clblast::Layout>(layout), - m, n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Ger(static_cast<clblast::Layout>(layout), + m, n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHger(const Layout layout, const size_t m, const size_t n, @@ -1709,14 +1828,15 @@ StatusCode CLBlastHger(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Ger(static_cast<clblast::Layout>(layout), - m, n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Ger(static_cast<clblast::Layout>(layout), + m, n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // GERU @@ -1727,14 +1847,15 @@ StatusCode CLBlastCgeru(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Geru(static_cast<clblast::Layout>(layout), - m, n, - float2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Geru(static_cast<clblast::Layout>(layout), + m, n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZgeru(const Layout layout, const size_t m, const size_t n, @@ -1743,14 +1864,15 @@ StatusCode CLBlastZgeru(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Geru(static_cast<clblast::Layout>(layout), - m, n, - double2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Geru(static_cast<clblast::Layout>(layout), + m, n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // GERC @@ -1761,14 +1883,15 @@ StatusCode CLBlastCgerc(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gerc(static_cast<clblast::Layout>(layout), - m, n, - float2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gerc(static_cast<clblast::Layout>(layout), + m, n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZgerc(const Layout layout, const size_t m, const size_t n, @@ -1777,14 +1900,15 @@ StatusCode CLBlastZgerc(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gerc(static_cast<clblast::Layout>(layout), - m, n, - double2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gerc(static_cast<clblast::Layout>(layout), + m, n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // HER @@ -1794,14 +1918,15 @@ StatusCode CLBlastCher(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Her(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Her(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZher(const Layout layout, const Triangle triangle, const size_t n, @@ -1809,14 +1934,15 @@ StatusCode CLBlastZher(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Her(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Her(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // HPR @@ -1826,14 +1952,15 @@ StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hpr(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Hpr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, const size_t n, @@ -1841,14 +1968,15 @@ StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hpr(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Hpr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // HER2 @@ -1859,15 +1987,16 @@ StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Her2(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - float2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Her2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, const size_t n, @@ -1876,15 +2005,16 @@ StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Her2(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - double2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Her2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // HPR2 @@ -1895,15 +2025,16 @@ StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hpr2(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - float2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Hpr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, const size_t n, @@ -1912,15 +2043,16 @@ StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hpr2(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - double2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Hpr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // SYR @@ -1930,14 +2062,15 @@ StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, const size_t n, @@ -1945,14 +2078,15 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle, const size_t n, @@ -1960,14 +2094,15 @@ StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // SPR @@ -1977,14 +2112,15 @@ StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spr(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Spr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, const size_t n, @@ -1992,14 +2128,15 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spr(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Spr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHspr(const Layout layout, const Triangle triangle, const size_t n, @@ -2007,14 +2144,15 @@ StatusCode CLBlastHspr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spr(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Spr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // SYR2 @@ -2025,15 +2163,16 @@ StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, const size_t n, @@ -2042,15 +2181,16 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle, const size_t n, @@ -2059,15 +2199,16 @@ StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // SPR2 @@ -2078,15 +2219,16 @@ StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spr2(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Spr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, const size_t n, @@ -2095,15 +2237,16 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spr2(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Spr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle, const size_t n, @@ -2112,15 +2255,16 @@ StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spr2(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Spr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // ================================================================================================= @@ -2136,17 +2280,18 @@ StatusCode CLBlastSgemm(const Layout layout, const Transpose a_transpose, const const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Transpose>(b_transpose), - m, n, k, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gemm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Transpose>(b_transpose), + m, n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, @@ -2156,17 +2301,18 @@ StatusCode CLBlastDgemm(const Layout layout, const Transpose a_transpose, const const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Transpose>(b_transpose), - m, n, k, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gemm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Transpose>(b_transpose), + m, n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, @@ -2176,17 +2322,18 @@ StatusCode CLBlastCgemm(const Layout layout, const Transpose a_transpose, const const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Transpose>(b_transpose), - m, n, k, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - float2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gemm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Transpose>(b_transpose), + m, n, k, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + float2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, @@ -2196,17 +2343,18 @@ StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Transpose>(b_transpose), - m, n, k, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - double2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gemm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Transpose>(b_transpose), + m, n, k, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + double2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, @@ -2216,17 +2364,18 @@ StatusCode CLBlastHgemm(const Layout layout, const Transpose a_transpose, const const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Transpose>(b_transpose), - m, n, k, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Gemm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Transpose>(b_transpose), + m, n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // SYMM @@ -2238,17 +2387,18 @@ StatusCode CLBlastSsymm(const Layout layout, const Side side, const Triangle tri const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Symm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDsymm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -2258,17 +2408,18 @@ StatusCode CLBlastDsymm(const Layout layout, const Side side, const Triangle tri const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Symm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCsymm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -2278,17 +2429,18 @@ StatusCode CLBlastCsymm(const Layout layout, const Side side, const Triangle tri const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - m, n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - float2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Symm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + float2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -2298,17 +2450,18 @@ StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle tri const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - m, n, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - double2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Symm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + double2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -2318,17 +2471,18 @@ StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle tri const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Symm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // HEMM @@ -2340,17 +2494,18 @@ StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle tri const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hemm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - m, n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - float2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Hemm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + float2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZhemm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -2360,17 +2515,18 @@ StatusCode CLBlastZhemm(const Layout layout, const Side side, const Triangle tri const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hemm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - m, n, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - double2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Hemm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + double2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // SYRK @@ -2381,16 +2537,17 @@ StatusCode CLBlastSsyrk(const Layout layout, const Triangle triangle, const Tran const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syrk(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syrk(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, @@ -2399,16 +2556,17 @@ StatusCode CLBlastDsyrk(const Layout layout, const Triangle triangle, const Tran const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syrk(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syrk(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, @@ -2417,16 +2575,17 @@ StatusCode CLBlastCsyrk(const Layout layout, const Triangle triangle, const Tran const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syrk(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - n, k, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - float2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syrk(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + n, k, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + float2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, @@ -2435,16 +2594,17 @@ StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Tran const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syrk(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - n, k, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - double2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syrk(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + n, k, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + double2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, @@ -2453,16 +2613,17 @@ StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Tran const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syrk(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syrk(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // HERK @@ -2473,16 +2634,17 @@ StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Tran const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Herk(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Herk(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, @@ -2491,16 +2653,17 @@ StatusCode CLBlastZherk(const Layout layout, const Triangle triangle, const Tran const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Herk(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Herk(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // SYR2K @@ -2512,17 +2675,18 @@ StatusCode CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Tra const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2k(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(ab_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syr2k(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(ab_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, @@ -2532,17 +2696,18 @@ StatusCode CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Tra const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2k(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(ab_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syr2k(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(ab_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, @@ -2552,17 +2717,18 @@ StatusCode CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Tra const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2k(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(ab_transpose), - n, k, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - float2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syr2k(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(ab_transpose), + n, k, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + float2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, @@ -2572,17 +2738,18 @@ StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Tra const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2k(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(ab_transpose), - n, k, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - double2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syr2k(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(ab_transpose), + n, k, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + double2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, @@ -2592,17 +2759,18 @@ StatusCode CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Tra const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2k(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(ab_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Syr2k(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(ab_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // HER2K @@ -2614,17 +2782,18 @@ StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Tra const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Her2k(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(ab_transpose), - n, k, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Her2k(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(ab_transpose), + n, k, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, @@ -2634,17 +2803,18 @@ StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Tra const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Her2k(static_cast<clblast::Layout>(layout), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(ab_transpose), - n, k, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Her2k(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(ab_transpose), + n, k, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // TRMM @@ -2654,17 +2824,18 @@ StatusCode CLBlastStrmm(const Layout layout, const Side side, const Triangle tri const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trmm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, @@ -2672,17 +2843,18 @@ StatusCode CLBlastDtrmm(const Layout layout, const Side side, const Triangle tri const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trmm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, @@ -2690,17 +2862,18 @@ StatusCode CLBlastCtrmm(const Layout layout, const Side side, const Triangle tri const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - m, n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trmm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, @@ -2708,17 +2881,18 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - m, n, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trmm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, @@ -2726,17 +2900,18 @@ StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle tri const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trmm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // TRSM @@ -2746,17 +2921,18 @@ StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle tri const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trsm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, @@ -2764,17 +2940,18 @@ StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle tri const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trsm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, @@ -2782,17 +2959,18 @@ StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle tri const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - m, n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trsm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, @@ -2800,17 +2978,18 @@ StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle tri const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - m, n, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trsm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, @@ -2818,17 +2997,18 @@ StatusCode CLBlastHtrsm(const Layout layout, const Side side, const Triangle tri const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsm(static_cast<clblast::Layout>(layout), - static_cast<clblast::Side>(side), - static_cast<clblast::Triangle>(triangle), - static_cast<clblast::Transpose>(a_transpose), - static_cast<clblast::Diagonal>(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Trsm(static_cast<clblast::Layout>(layout), + static_cast<clblast::Side>(side), + static_cast<clblast::Triangle>(triangle), + static_cast<clblast::Transpose>(a_transpose), + static_cast<clblast::Diagonal>(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // ================================================================================================= @@ -2842,14 +3022,15 @@ StatusCode CLBlastSomatcopy(const Layout layout, const Transpose a_transpose, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Omatcopy(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastDomatcopy(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -2857,14 +3038,15 @@ StatusCode CLBlastDomatcopy(const Layout layout, const Transpose a_transpose, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Omatcopy(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastComatcopy(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -2872,14 +3054,15 @@ StatusCode CLBlastComatcopy(const Layout layout, const Transpose a_transpose, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Omatcopy(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastZomatcopy(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -2887,14 +3070,15 @@ StatusCode CLBlastZomatcopy(const Layout layout, const Transpose a_transpose, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Omatcopy(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } StatusCode CLBlastHomatcopy(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -2902,26 +3086,31 @@ StatusCode CLBlastHomatcopy(const Layout layout, const Transpose a_transpose, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout), - static_cast<clblast::Transpose>(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast<StatusCode>(status); + try { + return static_cast<StatusCode>(clblast::Omatcopy(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event)); + } catch (...) 
{ return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // ================================================================================================= // Clears the cache of stored binaries StatusCode CLBlastClearCache() { - return static_cast<StatusCode>(clblast::ClearCache()); + try { + return static_cast<StatusCode>(clblast::ClearCache()); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // Fills the cache with binaries for a specific device StatusCode CLBlastFillCache(const cl_device_id device) { - return static_cast<StatusCode>(clblast::FillCache(device)); + try { + return static_cast<StatusCode>(clblast::FillCache(device)); + } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); } } // ================================================================================================= diff --git a/src/clblast_exceptions.cpp b/src/clblast_exceptions.cpp new file mode 100644 index 00000000..68d31e46 --- /dev/null +++ b/src/clblast_exceptions.cpp @@ -0,0 +1,95 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Ivan Shapovalov <intelfx@intelfx.name> +// +// This file implements the exception hierarchy for CLBlast. It contains classes for exceptions +// generated by different parts of CLBlast (e.g. OpenCL API calls, internal logic, semantic BLAS +// errors). 
+// +// ================================================================================================= + +#include "clblast_exceptions.hpp" + +namespace { +// ================================================================================================= + +std::string MakeReason(const std::string &reason, const std::string &subreason) { + std::string r = reason; + if (!subreason.empty()) { + r += " (" + subreason + ")"; + } + return r; +} + +} // anonymous namespace + +namespace clblast { +// ================================================================================================= + +BLASError::BLASError(StatusCode status, const std::string &subreason): + ErrorCode(status, + subreason, + "BLAS error: " + MakeReason(std::to_string(static_cast<int>(status)), subreason)) { +} + +RuntimeErrorCode::RuntimeErrorCode(StatusCode status, const std::string &subreason): + ErrorCode(status, + subreason, + MakeReason(std::to_string(static_cast<int>(status)), subreason)) { +} + +// ================================================================================================= + +StatusCode DispatchException() +{ + const char *message = nullptr; + StatusCode status; + + try { + throw; + } catch (BLASError &e) { + // no message is printed for invalid argument errors + status = e.status(); + } catch (CLError &e) { + message = e.what(); + status = static_cast<StatusCode>(e.status()); + } catch (RuntimeErrorCode &e) { + message = e.what(); + status = e.status(); + } catch (Error<std::runtime_error> &e) { + message = e.what(); + status = StatusCode::kUnknownError; + } + + if (message) { + fprintf(stderr, "CLBlast: %s\n", message); + } + return status; +} + +// ================================================================================================= + +StatusCode DispatchExceptionForC() +{ + const char *message = nullptr; + + try { + throw; + } catch (std::exception &e) { + message = e.what(); + } catch (...) 
{ + message = "unknown exception"; + } + + fprintf (stderr, "CLBlast (unexpected): %s\n", message); + return StatusCode::kUnexpectedError; +} + +// ================================================================================================= + +} // namespace clblast diff --git a/src/clblast_exceptions.hpp b/src/clblast_exceptions.hpp new file mode 100644 index 00000000..89f5e761 --- /dev/null +++ b/src/clblast_exceptions.hpp @@ -0,0 +1,50 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Ivan Shapovalov <intelfx@intelfx.name> +// +// This file implements the exception hierarchy for CLBlast. It contains classes for exceptions +// generated by different parts of CLBlast (e.g. OpenCL API calls, internal logic, semantic BLAS +// errors). 
+// +// ================================================================================================= + +#ifndef CLBLAST_EXCEPTIONS_H_ +#define CLBLAST_EXCEPTIONS_H_ + +#include "clblast.h" +#include "clpp11.hpp" + +namespace clblast { +// ================================================================================================= + +// Represents a semantic error in BLAS function arguments +class PUBLIC_API BLASError : public ErrorCode<Error<std::invalid_argument>, StatusCode> { + public: + explicit BLASError(StatusCode status, const std::string &subreason = std::string{}); +}; +// ================================================================================================= + +// Represents a runtime error generated by internal logic +class PUBLIC_API RuntimeErrorCode : public ErrorCode<RuntimeError, StatusCode> { + public: + explicit RuntimeErrorCode(StatusCode status, const std::string &subreason = std::string{}); +}; + +// ================================================================================================= + +// Handles (most of the) runtime exceptions and converts them to StatusCode +StatusCode DispatchException(); + +// Handles remaining exceptions and converts them to StatusCode::kUnexpectedError +StatusCode DispatchExceptionForC(); + +// ================================================================================================= + +} // namespace clblast + +#endif // CLBLAST_EXCEPTIONS_H_ diff --git a/src/clpp11.hpp b/src/clpp11.hpp index aaa76cb4..d306bb87 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -41,8 +41,8 @@ #include <string> // std::string #include <vector> // std::vector #include <memory> // std::shared_ptr -#include <stdexcept> // std::runtime_error #include <numeric> // std::accumulate +#include <cstring> // std::strlen // OpenCL #if defined(__APPLE__) || defined(__MACOSX) @@ -51,20 +51,41 @@ #include <CL/opencl.h> #endif +// Exception classes +#include "cxpp11_common.hpp" + namespace clblast { // 
================================================================================================= -// Error occurred in the C++11 OpenCL header (this file) -inline void Error(const std::string &message) { - throw std::runtime_error("Internal OpenCL error: "+message); -} +// Represents a runtime error returned by an OpenCL API function +class CLError : public ErrorCode<DeviceError, cl_int> { + public: + explicit CLError(cl_int status, const std::string &where): + ErrorCode(status, + where, + "OpenCL error: " + where + ": " + std::to_string(static_cast<int>(status))) { + } -// Error occurred in OpenCL -inline void CheckError(const cl_int status) { - if (status != CL_SUCCESS) { - throw std::runtime_error("Internal OpenCL error: "+std::to_string(status)); + static void Check(const cl_int status, const std::string &where) { + if (status != CL_SUCCESS) { + throw CLError(status, where); + } } -} + + static void CheckDtor(const cl_int status, const std::string &where) { + if (status != CL_SUCCESS) { + fprintf(stderr, "CLBlast: %s (ignoring)\n", CLError(status, where).what()); + } + } +}; + +// ================================================================================================= + +// Error occurred in OpenCL +#define CheckError(call) CLError::Check(call, CLError::TrimCallString(#call)) + +// Error occurred in OpenCL (no-exception version for destructors) +#define CheckErrorDtor(call) CLError::CheckDtor(call, CLError::TrimCallString(#call)) // ================================================================================================= @@ -81,7 +102,7 @@ class Event { // Regular constructor with memory management explicit Event(): event_(new cl_event, [](cl_event* e) { - if (*e) { CheckError(clReleaseEvent(*e)); } + if (*e) { CheckErrorDtor(clReleaseEvent(*e)); } delete e; }) { *event_ = nullptr; @@ -92,16 +113,17 @@ class Event { CheckError(clWaitForEvents(1, &(*event_))); } - // Retrieves the elapsed time of the last recorded event. 
Note that no error checking is done on - // the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation: - // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx + // Retrieves the elapsed time of the last recorded event. + // (Note that there is a bug in Apple's OpenCL implementation of the 'clGetEventProfilingInfo' function: + // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx) + // However, in our case the reply size is fixed to be cl_ulong, so we are not affected. float GetElapsedTime() const { WaitForCompletion(); const auto bytes = sizeof(cl_ulong); auto time_start = cl_ulong{0}; - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); + CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr)); auto time_end = cl_ulong{0}; - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); + CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr)); return static_cast<float>(time_end - time_start) * 1.0e-6f; } @@ -130,10 +152,14 @@ class Platform { explicit Platform(const size_t platform_id) { auto num_platforms = cl_uint{0}; CheckError(clGetPlatformIDs(0, nullptr, &num_platforms)); - if (num_platforms == 0) { Error("no platforms found"); } + if (num_platforms == 0) { + throw RuntimeError("Platform: no platforms found"); + } + if (platform_id >= num_platforms) { + throw RuntimeError("Platform: invalid platform ID "+std::to_string(platform_id)); + } auto platforms = std::vector<cl_platform_id>(num_platforms); CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr)); - if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); } platform_ = platforms[platform_id]; } @@ -173,11 +199,16 @@ class Device { // Initialize the device. Note that this constructor can throw exceptions! 
explicit Device(const Platform &platform, const size_t device_id) { auto num_devices = platform.NumDevices(); - if (num_devices == 0) { Error("no devices found"); } + if (num_devices == 0) { + throw RuntimeError("Device: no devices found"); + } + if (device_id >= num_devices) { + throw RuntimeError("Device: invalid device ID "+std::to_string(device_id)); + } + auto devices = std::vector<cl_device_id>(num_devices); CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices), devices.data(), nullptr)); - if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); } device_ = devices[device_id]; } @@ -282,7 +313,8 @@ class Device { auto result = std::string{}; result.resize(bytes); CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr)); - return std::string{result.c_str()}; // Removes any trailing '\0'-characters + result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters + return result; } }; @@ -300,11 +332,11 @@ class Context { // Regular constructor with memory management explicit Context(const Device &device): - context_(new cl_context, [](cl_context* c) { CheckError(clReleaseContext(*c)); delete c; }) { + context_(new cl_context, [](cl_context* c) { CheckErrorDtor(clReleaseContext(*c)); delete c; }) { auto status = CL_SUCCESS; const cl_device_id dev = device(); *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status); - CheckError(status); + CLError::Check(status, "clCreateContext"); } // Accessor to the private data-member @@ -329,18 +361,18 @@ class Program { // Source-based constructor with memory management explicit Program(const Context &context, std::string source): - program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), + program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }), length_(source.length()), source_(std::move(source)), source_ptr_(&source_[0]) { auto 
status = CL_SUCCESS; *program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status); - CheckError(status); + CLError::Check(status, "clCreateProgramWithSource"); } // Binary-based constructor with memory management explicit Program(const Device &device, const Context &context, const std::string& binary): - program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), + program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }), length_(binary.length()), source_(binary), source_ptr_(&source_[0]) { @@ -350,25 +382,15 @@ class Program { *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_, reinterpret_cast<const unsigned char**>(&source_ptr_), &status1, &status2); - CheckError(status1); - CheckError(status2); + CLError::Check(status1, "clCreateProgramWithBinary (binary status)"); + CLError::Check(status2, "clCreateProgramWithBinary"); } // Compiles the device program and returns whether or not there where any warnings/errors - BuildStatus Build(const Device &device, std::vector<std::string> &options) { + void Build(const Device &device, std::vector<std::string> &options) { auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "}); const cl_device_id dev = device(); - auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr); - if (status == CL_BUILD_PROGRAM_FAILURE) { - return BuildStatus::kError; - } - else if (status == CL_INVALID_BINARY) { - return BuildStatus::kInvalid; - } - else { - CheckError(status); - return BuildStatus::kSuccess; - } + CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr)); } // Retrieves the warning/error message from the compiler (if any) @@ -416,7 +438,7 @@ class Queue { // Regular constructor with memory management explicit Queue(const Context &context, const Device &device): - queue_(new cl_command_queue, [](cl_command_queue* s) { 
CheckError(clReleaseCommandQueue(*s)); + queue_(new cl_command_queue, [](cl_command_queue* s) { CheckErrorDtor(clReleaseCommandQueue(*s)); delete s; }) { auto status = CL_SUCCESS; #ifdef CL_VERSION_2_0 @@ -425,15 +447,17 @@ class Queue { { cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status); + CLError::Check(status, "clCreateCommandQueueWithProperties"); } else { *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); + CLError::Check(status, "clCreateCommandQueue"); } #else *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); + CLError::Check(status, "clCreateCommandQueue"); #endif - CheckError(status); } // Synchronizes the queue @@ -525,7 +549,7 @@ class Buffer { if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; } auto status = CL_SUCCESS; *buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status); - CheckError(status); + CLError::Check(status, "clCreateBuffer"); } // As above, but now with read/write access as a default @@ -546,18 +570,24 @@ class Buffer { // Copies from device to host: reading the device buffer a-synchronously void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { - if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); } + if (access_ == BufferAccess::kWriteOnly) { + throw LogicError("Buffer: reading from a write-only buffer"); + } CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T), host, 0, nullptr, nullptr)); } void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) const { - if (host.size() < size) { Error("target host buffer is too small"); } + if (host.size() < size) { + throw LogicError("Buffer: target host buffer is too small"); + } ReadAsync(queue, 
size, host.data(), offset); } void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) const { - if (host.size() < size) { Error("target host buffer is too small"); } + if (host.size() < size) { + throw LogicError("Buffer: target host buffer is too small"); + } ReadAsync(queue, size, host.data(), offset); } @@ -577,8 +607,12 @@ class Buffer { // Copies from host to device: writing the device buffer a-synchronously void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { - if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); } - if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); } + if (access_ == BufferAccess::kReadOnly) { + throw LogicError("Buffer: writing to a read-only buffer"); + } + if (GetSize() < (offset+size)*sizeof(T)) { + throw LogicError("Buffer: target device buffer is too small"); + } CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T), host, 0, nullptr, nullptr)); } @@ -644,10 +678,10 @@ class Kernel { // Regular constructor with memory management explicit Kernel(const Program &program, const std::string &name): - kernel_(new cl_kernel, [](cl_kernel* k) { CheckError(clReleaseKernel(*k)); delete k; }) { + kernel_(new cl_kernel, [](cl_kernel* k) { CheckErrorDtor(clReleaseKernel(*k)); delete k; }) { auto status = CL_SUCCESS; *kernel_ = clCreateKernel(program(), name.c_str(), &status); - CheckError(status); + CLError::Check(status, "clCreateKernel"); } // Sets a kernel argument at the indicated position diff --git a/src/cxpp11_common.hpp b/src/cxpp11_common.hpp new file mode 100644 index 00000000..c164ec1d --- /dev/null +++ b/src/cxpp11_common.hpp @@ -0,0 +1,87 @@ +#ifndef CLBLAST_CXPP11_COMMON_H_ +#define CLBLAST_CXPP11_COMMON_H_ + +// C++ +#include <string> // std::string +#include <stdexcept> // std::runtime_error + +namespace clblast { +// 
================================================================================================= + +// Basic exception class: represents an error happened inside our code +// (as opposed to an error in C++ runtime) +template <typename Base> +class Error : public Base { + public: + using Base::Base; +}; + +// ================================================================================================= + +// Represents a generic device-specific runtime error (returned by an OpenCL or CUDA API function) +class DeviceError : public Error<std::runtime_error> { + public: + using Error<std::runtime_error>::Error; + + static std::string TrimCallString(const char *where) { + const char *paren = strchr(where, '('); + if (paren) { + return std::string(where, paren); + } else { + return std::string(where); + } + } +}; + +// ================================================================================================= + +// Represents a generic runtime error (aka environmental problem) +class RuntimeError : public Error<std::runtime_error> { + public: + explicit RuntimeError(const std::string &reason): + Error("Run-time error: " + reason) { + } +}; + +// ================================================================================================= + +// Represents a generic logic error (aka failed assertion) +class LogicError : public Error<std::logic_error> { + public: + explicit LogicError(const std::string &reason): + Error("Internal logic error: " + reason) { + } +}; + +// ================================================================================================= + +// Internal exception base class with a status field and a subclass-specific "details" field +// which can be used to recreate an exception +template <typename Base, typename Status> +class ErrorCode : public Base { + public: + ErrorCode(Status status, const std::string &details, const std::string &reason): + Base(reason), + status_(status), + details_(details) { + } + + Status status() const { 
+ return status_; + } + + const std::string& details() const { + return details_; + } + + private: + const Status status_; + const std::string details_; +}; + +// ================================================================================================= + +} // namespace clblast + +// CLBLAST_CXPP11_COMMON_H_ +#endif diff --git a/src/database/database.cpp b/src/database/database.cpp index 2340a89c..9b8537c2 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -92,7 +92,7 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels, } } - if (!search_result) { throw std::runtime_error("Database error, could not find a suitable entry"); } + if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); } } } diff --git a/src/routine.cpp b/src/routine.cpp index 80764b74..acafb0d2 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -21,10 +21,11 @@ namespace clblast { // ================================================================================================= -// Constructor: not much here, because no status codes can be returned +// The constructor does all heavy work, errors are returned as exceptions Routine::Routine(Queue &queue, EventPointer event, const std::string &name, const std::vector<std::string> &routines, const Precision precision, - const std::vector<const Database::DatabaseEntry*> &userDatabase): + const std::vector<const Database::DatabaseEntry*> &userDatabase, + std::initializer_list<const char *> source): precision_(precision), routine_name_(name), queue_(queue), @@ -33,15 +34,9 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name, device_(queue_.GetDevice()), device_name_(device_.Name()), db_(queue_, routines, precision_, userDatabase) { -} - -// ================================================================================================= - -// Separate set-up function to allow for status codes to be returned -StatusCode Routine::SetUp() { // Queries 
the cache to see whether or not the program (context-specific) is already there - if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; } + if (ProgramIsInCache(context_, precision_, routine_name_)) { return; } // Sets the build options from an environmental variable (if set) auto options = std::vector<std::string>(); @@ -53,13 +48,10 @@ StatusCode Routine::SetUp() { // Queries the cache to see whether or not the binary (device-specific) is already there. If it // is, a program is created and stored in the cache if (BinaryIsInCache(device_name_, precision_, routine_name_)) { - try { - auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_); - auto program = Program(device_, context_, binary); - program.Build(device_, options); - StoreProgramToCache(program, context_, precision_, routine_name_); - } catch (...) { return StatusCode::kBuildProgramFailure; } - return StatusCode::kSuccess; + auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_); + auto program = Program(device_, context_, binary); + program.Build(device_, options); + StoreProgramToCache(program, context_, precision_, routine_name_); } // Otherwise, the kernel will be compiled and program will be built. 
Both the binary and the @@ -69,48 +61,50 @@ StatusCode Routine::SetUp() { const auto extensions = device_.Capabilities(); if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) { if (extensions.find(kKhronosDoublePrecision) == std::string::npos) { - return StatusCode::kNoDoublePrecision; + throw RuntimeErrorCode(StatusCode::kNoDoublePrecision); } } // As above, but for cl_khr_fp16 (half precision) if (precision_ == Precision::kHalf) { if (extensions.find(kKhronosHalfPrecision) == std::string::npos) { - return StatusCode::kNoHalfPrecision; + throw RuntimeErrorCode(StatusCode::kNoHalfPrecision); } } - // Loads the common header (typedefs and defines and such) - std::string common_header = - #include "kernels/common.opencl" - ; - // Collects the parameters for this device in the form of defines, and adds the precision - auto defines = db_.GetDefines(); - defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n"; + auto source_string = db_.GetDefines(); + source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n"; // Adds the name of the routine as a define - defines += "#define ROUTINE_"+routine_name_+"\n"; + source_string += "#define ROUTINE_"+routine_name_+"\n"; // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve // performance, but might result in a reduced accuracy. if (device_.IsAMD() && device_.IsGPU()) { - defines += "#define USE_CL_MAD 1\n"; + source_string += "#define USE_CL_MAD 1\n"; } // For specific devices, use staggered/shuffled workgroup indices. 
if (device_.IsAMD() && device_.IsGPU()) { - defines += "#define USE_STAGGERED_INDICES 1\n"; + source_string += "#define USE_STAGGERED_INDICES 1\n"; } // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize // performance through better cache behaviour if (device_.IsARM() && device_.IsGPU()) { - defines += "#define GLOBAL_MEM_FENCE 1\n"; + source_string += "#define GLOBAL_MEM_FENCE 1\n"; } - // Combines everything together into a single source string - const auto source_string = defines + common_header + source_string_; + // Loads the common header (typedefs and defines and such) + source_string += + #include "kernels/common.opencl" + ; + + // Adds routine-specific code to the constructed source string + for (const char *s: source) { + source_string += s; + } // Prints details of the routine to compile in case of debugging in verbose mode #ifdef VERBOSE @@ -120,23 +114,21 @@ StatusCode Routine::SetUp() { #endif // Compiles the kernel + auto program = Program(context_, source_string); try { - auto program = Program(context_, source_string); - const auto build_status = program.Build(device_, options); - - // Checks for compiler crashes/errors/warnings - if (build_status == BuildStatus::kError) { - const auto message = program.GetBuildInfo(device_); - fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str()); - return StatusCode::kBuildProgramFailure; + program.Build(device_, options); + } catch (const CLError &e) { + if (e.status() == CL_BUILD_PROGRAM_FAILURE) { + fprintf(stdout, "OpenCL compiler error/warning: %s\n", + program.GetBuildInfo(device_).c_str()); } - if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; } + throw; + } - // Store the compiled binary and program in the cache - const auto binary = program.GetIR(); - StoreBinaryToCache(binary, device_name_, precision_, routine_name_); - StoreProgramToCache(program, context_, precision_, routine_name_); - } catch (...) 
{ return StatusCode::kBuildProgramFailure; } + // Store the compiled binary and program in the cache + const auto binary = program.GetIR(); + StoreBinaryToCache(binary, device_name_, precision_, routine_name_); + StoreProgramToCache(program, context_, precision_, routine_name_); // Prints the elapsed compilation time in case of debugging in verbose mode #ifdef VERBOSE @@ -144,9 +136,6 @@ StatusCode Routine::SetUp() { const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count(); printf("[DEBUG] Completed compilation in %.2lf ms\n", timing); #endif - - // No errors, normal termination of this function - return StatusCode::kSuccess; } // ================================================================================================= diff --git a/src/routine.hpp b/src/routine.hpp index 8582a2b7..f4ad435e 100644 --- a/src/routine.hpp +++ b/src/routine.hpp @@ -34,21 +34,19 @@ class Routine { // Base class constructor. The user database is an optional extra database to override the // built-in database. + // All heavy preparation work is done inside this constructor. 
explicit Routine(Queue &queue, EventPointer event, const std::string &name, const std::vector<std::string> &routines, const Precision precision, - const std::vector<const Database::DatabaseEntry*> &userDatabase = {}); - - // Set-up phase of the kernel - StatusCode SetUp(); + const std::vector<const Database::DatabaseEntry*> &userDatabase, + std::initializer_list<const char *> source); protected: // Non-static variable for the precision const Precision precision_; - // The routine's name and its kernel-source in string form + // The routine's name const std::string routine_name_; - std::string source_string_; // The OpenCL objects, accessible only from derived classes Queue queue_; diff --git a/src/routines/common.cpp b/src/routines/common.cpp index 3969cf9f..c995dc12 100644 --- a/src/routines/common.cpp +++ b/src/routines/common.cpp @@ -20,22 +20,26 @@ namespace clblast { // ================================================================================================= // Enqueues a kernel, waits for completion, and checks for errors -StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, - std::vector<size_t> global, const std::vector<size_t> &local, - EventPointer event, const std::vector<Event> &waitForEvents) { +void RunKernel(Kernel &kernel, Queue &queue, const Device &device, + std::vector<size_t> global, const std::vector<size_t> &local, + EventPointer event, const std::vector<Event> &waitForEvents) { if (!local.empty()) { // Tests for validity of the local thread sizes if (local.size() > device.MaxWorkItemDimensions()) { - return StatusCode::kInvalidLocalNumDimensions; + throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions); } const auto max_work_item_sizes = device.MaxWorkItemSizes(); for (auto i=size_t{0}; i<local.size(); ++i) { - if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; } + if (local[i] > max_work_item_sizes[i]) { + throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim); + } } 
auto local_size = size_t{1}; for (auto &item: local) { local_size *= item; } - if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; } + if (local_size > device.MaxWorkGroupSize()) { + throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal); + } // Make sure the global thread sizes are at least equal to the local sizes for (auto i=size_t{0}; i<global.size(); ++i) { @@ -45,7 +49,9 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, // Tests for local memory usage const auto local_mem_usage = kernel.LocalMemUsage(device); - if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; } + if (!device.IsLocalMemoryValid(local_mem_usage)) { + throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage); + } // Prints the name of the kernel to launch in case of debugging in verbose mode #ifdef VERBOSE @@ -55,9 +61,7 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, #endif // Launches the kernel (and checks for launch errors) - try { - kernel.Launch(queue, global, local, event, waitForEvents); - } catch (...) 
{ return StatusCode::kKernelLaunchError; } + kernel.Launch(queue, global, local, event, waitForEvents); // Prints the elapsed execution time in case of debugging in verbose mode #ifdef VERBOSE @@ -66,9 +70,6 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count(); printf("[DEBUG] Completed kernel in %.2lf ms\n", timing); #endif - - // No errors, normal termination of this function - return StatusCode::kSuccess; } // ================================================================================================= diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 9d8849c3..802abec4 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -27,29 +27,29 @@ namespace clblast { // ================================================================================================= // Enqueues a kernel, waits for completion, and checks for errors -StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, - std::vector<size_t> global, const std::vector<size_t> &local, - EventPointer event, const std::vector<Event> &waitForEvents = {}); +void RunKernel(Kernel &kernel, Queue &queue, const Device &device, + std::vector<size_t> global, const std::vector<size_t> &local, + EventPointer event, const std::vector<Event> &waitForEvents = {}); // ================================================================================================= // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able // to write to symmetric and triangular matrices through optional arguments. 
template <typename T> -StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, - const Database &db, - EventPointer event, const std::vector<Event> &waitForEvents, - const size_t src_one, const size_t src_two, - const size_t src_ld, const size_t src_offset, - const Buffer<T> &src, - const size_t dest_one, const size_t dest_two, - const size_t dest_ld, const size_t dest_offset, - const Buffer<T> &dest, - const T alpha, - const Program &program, const bool do_pad, - const bool do_transpose, const bool do_conjugate, - const bool upper = false, const bool lower = false, - const bool diagonal_imag_zero = false) { +void PadCopyTransposeMatrix(Queue &queue, const Device &device, + const Database &db, + EventPointer event, const std::vector<Event> &waitForEvents, + const size_t src_one, const size_t src_two, + const size_t src_ld, const size_t src_offset, + const Buffer<T> &src, + const size_t dest_one, const size_t dest_two, + const size_t dest_ld, const size_t dest_offset, + const Buffer<T> &dest, + const T alpha, + const Program &program, const bool do_pad, + const bool do_transpose, const bool do_conjugate, + const bool upper = false, const bool lower = false, + const bool diagonal_imag_zero = false) { // Determines whether or not the fast-version could potentially be used auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && @@ -84,77 +84,75 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, } // Retrieves the kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program, kernel_name); - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(src_ld)); - kernel.SetArgument(1, src()); - kernel.SetArgument(2, dest()); - kernel.SetArgument(3, GetRealArg(alpha)); + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(src_ld)); + kernel.SetArgument(1, src()); + 
kernel.SetArgument(2, dest()); + kernel.SetArgument(3, GetRealArg(alpha)); + } + else { + kernel.SetArgument(0, static_cast<int>(src_one)); + kernel.SetArgument(1, static_cast<int>(src_two)); + kernel.SetArgument(2, static_cast<int>(src_ld)); + kernel.SetArgument(3, static_cast<int>(src_offset)); + kernel.SetArgument(4, src()); + kernel.SetArgument(5, static_cast<int>(dest_one)); + kernel.SetArgument(6, static_cast<int>(dest_two)); + kernel.SetArgument(7, static_cast<int>(dest_ld)); + kernel.SetArgument(8, static_cast<int>(dest_offset)); + kernel.SetArgument(9, dest()); + kernel.SetArgument(10, GetRealArg(alpha)); + if (do_pad) { + kernel.SetArgument(11, static_cast<int>(do_conjugate)); } else { - kernel.SetArgument(0, static_cast<int>(src_one)); - kernel.SetArgument(1, static_cast<int>(src_two)); - kernel.SetArgument(2, static_cast<int>(src_ld)); - kernel.SetArgument(3, static_cast<int>(src_offset)); - kernel.SetArgument(4, src()); - kernel.SetArgument(5, static_cast<int>(dest_one)); - kernel.SetArgument(6, static_cast<int>(dest_two)); - kernel.SetArgument(7, static_cast<int>(dest_ld)); - kernel.SetArgument(8, static_cast<int>(dest_offset)); - kernel.SetArgument(9, dest()); - kernel.SetArgument(10, GetRealArg(alpha)); - if (do_pad) { - kernel.SetArgument(11, static_cast<int>(do_conjugate)); - } - else { - kernel.SetArgument(11, static_cast<int>(upper)); - kernel.SetArgument(12, static_cast<int>(lower)); - kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero)); - } + kernel.SetArgument(11, static_cast<int>(upper)); + kernel.SetArgument(12, static_cast<int>(lower)); + kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero)); } + } - // Launches the kernel and returns the error code. Uses global and local thread sizes based on - // parameters in the database. 
- if (do_transpose) { - if (use_fast_kernel) { - const auto global = std::vector<size_t>{ - dest_one / db["TRA_WPT"], - dest_two / db["TRA_WPT"] - }; - const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]}; - return RunKernel(kernel, queue, device, global, local, event, waitForEvents); - } - else { - const auto global = std::vector<size_t>{ - Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), - Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]) - }; - const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]}; - return RunKernel(kernel, queue, device, global, local, event, waitForEvents); - } + // Launches the kernel and returns the error code. Uses global and local thread sizes based on + // parameters in the database. + if (do_transpose) { + if (use_fast_kernel) { + const auto global = std::vector<size_t>{ + dest_one / db["TRA_WPT"], + dest_two / db["TRA_WPT"] + }; + const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { - if (use_fast_kernel) { - const auto global = std::vector<size_t>{ - dest_one / db["COPY_VW"], - dest_two / db["COPY_WPT"] - }; - const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]}; - return RunKernel(kernel, queue, device, global, local, event, waitForEvents); - } - else { - const auto global = std::vector<size_t>{ - Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), - Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]) - }; - const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]}; - return RunKernel(kernel, queue, device, global, local, event, waitForEvents); - } + const auto global = std::vector<size_t>{ + Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), + Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]) + }; + const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]}; + RunKernel(kernel, queue, device, 
global, local, event, waitForEvents); } - } catch (...) { return StatusCode::kInvalidKernel; } + } + else { + if (use_fast_kernel) { + const auto global = std::vector<size_t>{ + dest_one / db["COPY_VW"], + dest_two / db["COPY_WPT"] + }; + const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + else { + const auto global = std::vector<size_t>{ + Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), + Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]) + }; + const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + } } // ================================================================================================= diff --git a/src/routines/level1/xamax.cpp b/src/routines/level1/xamax.cpp index 6b6e7f9e..e9efa1a7 100644 --- a/src/routines/level1/xamax.cpp +++ b/src/routines/level1/xamax.cpp @@ -22,74 +22,64 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/xamax.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xamax<T>::DoAmax(const size_t n, - const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xamax<T>::DoAmax(const size_t n, + const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return 
StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorIndex(1, imax_buffer, imax_offset); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorIndex(1, imax_buffer, imax_offset); // Retrieves the Xamax kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xamax"); - auto kernel2 = Kernel(program, "XamaxEpilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer1 = Buffer<T>(context_, temp_size); - auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast<int>(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast<int>(x_offset)); - kernel1.SetArgument(3, static_cast<int>(x_inc)); - kernel1.SetArgument(4, temp_buffer1()); - kernel1.SetArgument(5, temp_buffer2()); - - // Event waiting list - auto eventWaitList = std::vector<Event>(); - - // Launches the main kernel - auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; - auto local1 = std::vector<size_t>{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer1()); - kernel2.SetArgument(1, temp_buffer2()); - kernel2.SetArgument(2, imax_buffer()); - kernel2.SetArgument(3, static_cast<int>(imax_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector<size_t>{db_["WGS2"]}; - auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, 
queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel1 = Kernel(program, "Xamax"); + auto kernel2 = Kernel(program, "XamaxEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer1 = Buffer<T>(context_, temp_size); + auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast<int>(x_offset)); + kernel1.SetArgument(3, static_cast<int>(x_inc)); + kernel1.SetArgument(4, temp_buffer1()); + kernel1.SetArgument(5, temp_buffer2()); + + // Event waiting list + auto eventWaitList = std::vector<Event>(); + + // Launches the main kernel + auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; + auto local1 = std::vector<size_t>{db_["WGS1"]}; + auto kernelEvent = Event(); + RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer1()); + kernel2.SetArgument(1, temp_buffer2()); + kernel2.SetArgument(2, imax_buffer()); + kernel2.SetArgument(3, static_cast<int>(imax_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector<size_t>{db_["WGS2"]}; + auto local2 = std::vector<size_t>{db_["WGS2"]}; + RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); } // ================================================================================================= diff --git a/src/routines/level1/xamax.hpp b/src/routines/level1/xamax.hpp index aa45a8e4..4d1e0082 100644 --- a/src/routines/level1/xamax.hpp +++ 
b/src/routines/level1/xamax.hpp @@ -28,9 +28,9 @@ class Xamax: public Routine { Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX"); // Templated-precision implementation of the routine - StatusCode DoAmax(const size_t n, - const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoAmax(const size_t n, + const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xasum.cpp b/src/routines/level1/xasum.cpp index 0c1ce903..a242a5fa 100644 --- a/src/routines/level1/xasum.cpp +++ b/src/routines/level1/xasum.cpp @@ -22,71 +22,61 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/xasum.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xasum<T>::DoAsum(const size_t n, - const Buffer<T> &asum_buffer, const size_t asum_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xasum<T>::DoAsum(const size_t n, + const Buffer<T> &asum_buffer, const size_t asum_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if 
(ErrorIn(status)) { return status; } - status = TestVectorScalar(1, asum_buffer, asum_offset); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorScalar(1, asum_buffer, asum_offset); // Retrieves the Xasum kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xasum"); - auto kernel2 = Kernel(program, "XasumEpilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer = Buffer<T>(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast<int>(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast<int>(x_offset)); - kernel1.SetArgument(3, static_cast<int>(x_inc)); - kernel1.SetArgument(4, temp_buffer()); - - // Event waiting list - auto eventWaitList = std::vector<Event>(); - - // Launches the main kernel - auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; - auto local1 = std::vector<size_t>{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer()); - kernel2.SetArgument(1, asum_buffer()); - kernel2.SetArgument(2, static_cast<int>(asum_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector<size_t>{db_["WGS2"]}; - auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel1 = Kernel(program, "Xasum"); + auto kernel2 = Kernel(program, "XasumEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer = Buffer<T>(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast<int>(x_offset)); + kernel1.SetArgument(3, static_cast<int>(x_inc)); + kernel1.SetArgument(4, temp_buffer()); + + // Event waiting list + auto eventWaitList = std::vector<Event>(); + + // Launches the main kernel + auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; + auto local1 = std::vector<size_t>{db_["WGS1"]}; + auto kernelEvent = Event(); + RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, asum_buffer()); + kernel2.SetArgument(2, static_cast<int>(asum_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector<size_t>{db_["WGS2"]}; + auto local2 = std::vector<size_t>{db_["WGS2"]}; + RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); } // ================================================================================================= diff --git a/src/routines/level1/xasum.hpp b/src/routines/level1/xasum.hpp index 5a253f4d..0afcc4ff 100644 --- a/src/routines/level1/xasum.hpp +++ b/src/routines/level1/xasum.hpp @@ -28,9 +28,9 @@ class Xasum: public Routine { Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM"); // Templated-precision implementation of the routine - StatusCode DoAsum(const size_t n, - const Buffer<T> &asum_buffer, const size_t asum_offset, - const Buffer<T> &x_buffer, const size_t x_offset, 
const size_t x_inc); + void DoAsum(const size_t n, + const Buffer<T> &asum_buffer, const size_t asum_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp index 3445e2b5..5436c5b7 100644 --- a/src/routines/level1/xaxpy.cpp +++ b/src/routines/level1/xaxpy.cpp @@ -22,29 +22,26 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/level1.opencl" #include "../../kernels/level1/xaxpy.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xaxpy<T>::DoAxpy(const size_t n, const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version 
can be used bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && @@ -55,45 +52,39 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha, auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy"; // Retrieves the Xaxpy kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, GetRealArg(alpha)); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, GetRealArg(alpha)); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast<int>(x_offset)); - kernel.SetArgument(4, static_cast<int>(x_inc)); - kernel.SetArgument(5, y_buffer()); - kernel.SetArgument(6, static_cast<int>(y_offset)); - kernel.SetArgument(7, static_cast<int>(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, GetRealArg(alpha)); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, GetRealArg(alpha)); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast<int>(x_offset)); + kernel.SetArgument(4, static_cast<int>(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, static_cast<int>(y_offset)); + kernel.SetArgument(7, static_cast<int>(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } } // ================================================================================================= diff --git a/src/routines/level1/xaxpy.hpp b/src/routines/level1/xaxpy.hpp index caac871e..9b30dfaa 100644 --- a/src/routines/level1/xaxpy.hpp +++ b/src/routines/level1/xaxpy.hpp @@ -28,9 +28,9 @@ class Xaxpy: public Routine { Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY"); // Templated-precision implementation of the routine - StatusCode DoAxpy(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoAxpy(const size_t n, const T alpha, + const Buffer<T> &x_buffer, const size_t 
x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xcopy.cpp b/src/routines/level1/xcopy.cpp index 673ef349..d86200c0 100644 --- a/src/routines/level1/xcopy.cpp +++ b/src/routines/level1/xcopy.cpp @@ -22,29 +22,26 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/level1.opencl" #include "../../kernels/level1/xcopy.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xcopy<T>::DoCopy(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xcopy<T>::DoCopy(const size_t n, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version can be used bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && @@ -55,43 +52,37 @@ StatusCode 
Xcopy<T>::DoCopy(const size_t n, auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy"; // Retrieves the Xcopy kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, static_cast<int>(x_offset)); - kernel.SetArgument(3, static_cast<int>(x_inc)); - kernel.SetArgument(4, y_buffer()); - kernel.SetArgument(5, static_cast<int>(y_offset)); - kernel.SetArgument(6, static_cast<int>(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, static_cast<int>(x_offset)); + kernel.SetArgument(3, static_cast<int>(x_inc)); + kernel.SetArgument(4, y_buffer()); + kernel.SetArgument(5, static_cast<int>(y_offset)); + kernel.SetArgument(6, static_cast<int>(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } } // ================================================================================================= diff --git a/src/routines/level1/xcopy.hpp b/src/routines/level1/xcopy.hpp index 0c424ba3..a6454fcc 100644 --- a/src/routines/level1/xcopy.hpp +++ b/src/routines/level1/xcopy.hpp @@ -28,9 +28,9 @@ class Xcopy: public Routine { Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY"); // Templated-precision implementation of the routine - StatusCode DoCopy(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoCopy(const size_t n, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // 
================================================================================================= diff --git a/src/routines/level1/xdot.cpp b/src/routines/level1/xdot.cpp index bafea157..9d718913 100644 --- a/src/routines/level1/xdot.cpp +++ b/src/routines/level1/xdot.cpp @@ -22,79 +22,68 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/xdot.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xdot<T>::DoDot(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const bool do_conjugate) { +void Xdot<T>::DoDot(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const bool do_conjugate) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorScalar(1, dot_buffer, dot_offset); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); + TestVectorScalar(1, dot_buffer, dot_offset); // Retrieves 
the Xdot kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xdot"); - auto kernel2 = Kernel(program, "XdotEpilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer = Buffer<T>(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast<int>(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast<int>(x_offset)); - kernel1.SetArgument(3, static_cast<int>(x_inc)); - kernel1.SetArgument(4, y_buffer()); - kernel1.SetArgument(5, static_cast<int>(y_offset)); - kernel1.SetArgument(6, static_cast<int>(y_inc)); - kernel1.SetArgument(7, temp_buffer()); - kernel1.SetArgument(8, static_cast<int>(do_conjugate)); - - // Event waiting list - auto eventWaitList = std::vector<Event>(); - - // Launches the main kernel - auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; - auto local1 = std::vector<size_t>{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer()); - kernel2.SetArgument(1, dot_buffer()); - kernel2.SetArgument(2, static_cast<int>(dot_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector<size_t>{db_["WGS2"]}; - auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel1 = Kernel(program, "Xdot"); + auto kernel2 = Kernel(program, "XdotEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer = Buffer<T>(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast<int>(x_offset)); + kernel1.SetArgument(3, static_cast<int>(x_inc)); + kernel1.SetArgument(4, y_buffer()); + kernel1.SetArgument(5, static_cast<int>(y_offset)); + kernel1.SetArgument(6, static_cast<int>(y_inc)); + kernel1.SetArgument(7, temp_buffer()); + kernel1.SetArgument(8, static_cast<int>(do_conjugate)); + + // Event waiting list + auto eventWaitList = std::vector<Event>(); + + // Launches the main kernel + auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; + auto local1 = std::vector<size_t>{db_["WGS1"]}; + auto kernelEvent = Event(); + RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, dot_buffer()); + kernel2.SetArgument(2, static_cast<int>(dot_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector<size_t>{db_["WGS2"]}; + auto local2 = std::vector<size_t>{db_["WGS2"]}; + RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); } // ================================================================================================= diff --git a/src/routines/level1/xdot.hpp b/src/routines/level1/xdot.hpp index 02c1efaa..a4c9dfa0 100644 --- a/src/routines/level1/xdot.hpp +++ b/src/routines/level1/xdot.hpp @@ -28,11 +28,11 @@ class Xdot: public Routine { Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT"); // 
Templated-precision implementation of the routine - StatusCode DoDot(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const bool do_conjugate = false); + void DoDot(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const bool do_conjugate = false); }; // ================================================================================================= diff --git a/src/routines/level1/xdotc.cpp b/src/routines/level1/xdotc.cpp index 27cf2bab..5a4e939a 100644 --- a/src/routines/level1/xdotc.cpp +++ b/src/routines/level1/xdotc.cpp @@ -29,14 +29,14 @@ Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xdotc<T>::DoDotc(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { - return DoDot(n, dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - true); +void Xdotc<T>::DoDotc(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + DoDot(n, dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + true); } // ================================================================================================= diff --git a/src/routines/level1/xdotc.hpp b/src/routines/level1/xdotc.hpp index b8cbdaf5..ab7465f5 100644 --- a/src/routines/level1/xdotc.hpp +++ b/src/routines/level1/xdotc.hpp @@ -31,10 
+31,10 @@ class Xdotc: public Xdot<T> { Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC"); // Templated-precision implementation of the routine - StatusCode DoDotc(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoDotc(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xdotu.cpp b/src/routines/level1/xdotu.cpp index 0bce70b7..b9d8bcef 100644 --- a/src/routines/level1/xdotu.cpp +++ b/src/routines/level1/xdotu.cpp @@ -28,14 +28,14 @@ Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xdotu<T>::DoDotu(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { - return DoDot(n, dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - false); +void Xdotu<T>::DoDotu(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + DoDot(n, dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + false); } // ================================================================================================= diff --git a/src/routines/level1/xdotu.hpp b/src/routines/level1/xdotu.hpp index b3f73086..cad91c58 100644 --- 
a/src/routines/level1/xdotu.hpp +++ b/src/routines/level1/xdotu.hpp @@ -31,10 +31,10 @@ class Xdotu: public Xdot<T> { Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU"); // Templated-precision implementation of the routine - StatusCode DoDotu(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoDotu(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xmax.hpp b/src/routines/level1/xmax.hpp index 5a0236f2..2b7a5ae7 100644 --- a/src/routines/level1/xmax.hpp +++ b/src/routines/level1/xmax.hpp @@ -35,10 +35,10 @@ class Xmax: public Xamax<T> { // Forwards to the regular absolute version. The implementation difference is realised in the // kernel through a pre-processor macro based on the name of the routine. - StatusCode DoMax(const size_t n, - const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { - return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc); + void DoMax(const size_t n, + const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc); } }; diff --git a/src/routines/level1/xmin.hpp b/src/routines/level1/xmin.hpp index 6befec64..47a195ea 100644 --- a/src/routines/level1/xmin.hpp +++ b/src/routines/level1/xmin.hpp @@ -35,10 +35,10 @@ class Xmin: public Xamax<T> { // Forwards to the regular max-absolute version. 
The implementation difference is realised in the // kernel through a pre-processor macro based on the name of the routine. - StatusCode DoMin(const size_t n, - const Buffer<unsigned int> &imin_buffer, const size_t imin_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { - return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc); + void DoMin(const size_t n, + const Buffer<unsigned int> &imin_buffer, const size_t imin_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc); } }; diff --git a/src/routines/level1/xnrm2.cpp b/src/routines/level1/xnrm2.cpp index 97615d8b..373820a4 100644 --- a/src/routines/level1/xnrm2.cpp +++ b/src/routines/level1/xnrm2.cpp @@ -22,71 +22,61 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/xnrm2.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xnrm2<T>::DoNrm2(const size_t n, - const Buffer<T> &nrm2_buffer, const size_t nrm2_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xnrm2<T>::DoNrm2(const size_t n, + const Buffer<T> &nrm2_buffer, const size_t nrm2_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status 
= TestVectorScalar(1, nrm2_buffer, nrm2_offset); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorScalar(1, nrm2_buffer, nrm2_offset); // Retrieves the Xnrm2 kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xnrm2"); - auto kernel2 = Kernel(program, "Xnrm2Epilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer = Buffer<T>(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast<int>(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast<int>(x_offset)); - kernel1.SetArgument(3, static_cast<int>(x_inc)); - kernel1.SetArgument(4, temp_buffer()); - - // Event waiting list - auto eventWaitList = std::vector<Event>(); - - // Launches the main kernel - auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; - auto local1 = std::vector<size_t>{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer()); - kernel2.SetArgument(1, nrm2_buffer()); - kernel2.SetArgument(2, static_cast<int>(nrm2_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector<size_t>{db_["WGS2"]}; - auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel1 = Kernel(program, "Xnrm2"); + auto kernel2 = Kernel(program, "Xnrm2Epilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer = Buffer<T>(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast<int>(x_offset)); + kernel1.SetArgument(3, static_cast<int>(x_inc)); + kernel1.SetArgument(4, temp_buffer()); + + // Event waiting list + auto eventWaitList = std::vector<Event>(); + + // Launches the main kernel + auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; + auto local1 = std::vector<size_t>{db_["WGS1"]}; + auto kernelEvent = Event(); + RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, nrm2_buffer()); + kernel2.SetArgument(2, static_cast<int>(nrm2_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector<size_t>{db_["WGS2"]}; + auto local2 = std::vector<size_t>{db_["WGS2"]}; + RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); } // ================================================================================================= diff --git a/src/routines/level1/xnrm2.hpp b/src/routines/level1/xnrm2.hpp index 7baf07f5..3183ce24 100644 --- a/src/routines/level1/xnrm2.hpp +++ b/src/routines/level1/xnrm2.hpp @@ -28,9 +28,9 @@ class Xnrm2: public Routine { Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2"); // Templated-precision implementation of the routine - StatusCode DoNrm2(const size_t n, - const Buffer<T> &nrm2_buffer, const size_t nrm2_offset, - const Buffer<T> &x_buffer, const size_t x_offset, 
const size_t x_inc); + void DoNrm2(const size_t n, + const Buffer<T> &nrm2_buffer, const size_t nrm2_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp index bcc43c3b..17410f01 100644 --- a/src/routines/level1/xscal.cpp +++ b/src/routines/level1/xscal.cpp @@ -22,26 +22,24 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/level1.opencl" #include "../../kernels/level1/xscal.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xscal<T>::DoScal(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xscal<T>::DoScal(const size_t n, const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vector for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); // Determines whether or not the fast-version can be used bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && @@ -51,41 +49,35 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha, auto kernel_name = (use_fast_kernel) ? 
"XscalFast" : "Xscal"; // Retrieves the Xscal kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, alpha); - kernel.SetArgument(2, x_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, alpha); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast<int>(x_offset)); - kernel.SetArgument(4, static_cast<int>(x_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + } + else { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast<int>(x_offset)); + kernel.SetArgument(4, static_cast<int>(x_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } } // ================================================================================================= diff --git a/src/routines/level1/xscal.hpp b/src/routines/level1/xscal.hpp index 6c585cb2..02c847cc 100644 --- a/src/routines/level1/xscal.hpp +++ b/src/routines/level1/xscal.hpp @@ -28,8 +28,8 @@ class Xscal: public Routine { Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL"); // Templated-precision implementation of the routine - StatusCode DoScal(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoScal(const size_t n, const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xsum.hpp b/src/routines/level1/xsum.hpp index 84e20bea..a69d6511 100644 --- a/src/routines/level1/xsum.hpp +++ 
b/src/routines/level1/xsum.hpp @@ -35,10 +35,10 @@ class Xsum: public Xasum<T> { // Forwards to the regular absolute version. The implementation difference is realised in the // kernel through a pre-processor macro based on the name of the routine. - StatusCode DoSum(const size_t n, - const Buffer<T> &sum_buffer, const size_t sum_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { - return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc); + void DoSum(const size_t n, + const Buffer<T> &sum_buffer, const size_t sum_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc); } }; diff --git a/src/routines/level1/xswap.cpp b/src/routines/level1/xswap.cpp index 03907cbd..c9b97dc9 100644 --- a/src/routines/level1/xswap.cpp +++ b/src/routines/level1/xswap.cpp @@ -22,29 +22,26 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/level1.opencl" #include "../../kernels/level1/xswap.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xswap<T>::DoSwap(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xswap<T>::DoSwap(const size_t n, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw 
BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version can be used bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && @@ -55,43 +52,37 @@ StatusCode Xswap<T>::DoSwap(const size_t n, auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap"; // Retrieves the Xswap kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, static_cast<int>(x_offset)); - kernel.SetArgument(3, static_cast<int>(x_inc)); - kernel.SetArgument(4, y_buffer()); - kernel.SetArgument(5, static_cast<int>(y_offset)); - kernel.SetArgument(6, static_cast<int>(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, static_cast<int>(x_offset)); + kernel.SetArgument(3, static_cast<int>(x_inc)); + kernel.SetArgument(4, y_buffer()); + kernel.SetArgument(5, static_cast<int>(y_offset)); + kernel.SetArgument(6, static_cast<int>(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } } // ================================================================================================= diff --git a/src/routines/level1/xswap.hpp b/src/routines/level1/xswap.hpp index 4f9ea36d..eadf58e5 100644 --- a/src/routines/level1/xswap.hpp +++ b/src/routines/level1/xswap.hpp @@ -28,9 +28,9 @@ class Xswap: public Routine { Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP"); // Templated-precision implementation of the routine - StatusCode DoSwap(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoSwap(const size_t n, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // 
================================================================================================= diff --git a/src/routines/level2/xgbmv.cpp b/src/routines/level2/xgbmv.cpp index ea4f001c..e80b9a96 100644 --- a/src/routines/level2/xgbmv.cpp +++ b/src/routines/level2/xgbmv.cpp @@ -29,13 +29,13 @@ Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Reverses the upper and lower band count auto rotated = (layout == Layout::kRowMajor); @@ -46,13 +46,13 @@ StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose, // The specific hermitian matrix-accesses are implemented in the kernel guarded by the // ROUTINE_GBMV define. 
bool fast_kernels = false; - return MatVec(layout, a_transpose, - m, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - 0, false, kl_real, ku_real); + MatVec(layout, a_transpose, + m, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + 0, false, kl_real, ku_real); } // ================================================================================================= diff --git a/src/routines/level2/xgbmv.hpp b/src/routines/level2/xgbmv.hpp index 686ab642..e5f670ec 100644 --- a/src/routines/level2/xgbmv.hpp +++ b/src/routines/level2/xgbmv.hpp @@ -33,13 +33,13 @@ class Xgbmv: public Xgemv<T> { Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV"); // Templated-precision implementation of the routine - StatusCode DoGbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoGbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 4e32ba41..7b4c2e8f 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -22,52 +22,51 @@ namespace clblast { // Constructor: 
forwards to base class constructor template <typename T> Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, { #include "../../kernels/level2/xgemv.opencl" #include "../../kernels/level2/xgemv_fast.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Performs the matrix-vector multiplication - return MatVec(layout, a_transpose, - m, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - true, true, - 0, false, 0, 0); // N/A for this routine + MatVec(layout, a_transpose, + m, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + true, true, + 0, false, 0, 0); // N/A for this routine } // ================================================================================================= // The generic implementation, also suited for other (non general) matrix-vector multiplications template <typename T> 
-StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - bool fast_kernel, bool fast_kernel_rot, - const size_t parameter, const bool packed, - const size_t kl, const size_t ku) { +void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + bool fast_kernel, bool fast_kernel_rot, + const size_t parameter, const bool packed, + const size_t kl, const size_t ku) { // Makes sure all dimensions are larger than zero - if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } + if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Computes whether or not the matrix has an alternative layout (row or column-major). 
auto a_altlayout = (layout == Layout::kRowMajor); @@ -91,14 +90,10 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, auto a_conjugate = (a_transpose == Transpose::kConjugate); // Tests the matrix and the vectors for validity - auto status = StatusCode::kSuccess; - if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); } - else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); } - if (ErrorIn(status)) { return status; } - status = TestVectorX(n_real, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(m_real, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + if (packed) { TestMatrixAP(n, a_buffer, a_offset); } + else { TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); } + TestVectorX(n_real, x_buffer, x_offset, x_inc); + TestVectorY(m_real, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version can be used fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && @@ -127,39 +122,33 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, } // Retrieves the Xgemv kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(m_real)); - kernel.SetArgument(1, static_cast<int>(n_real)); - kernel.SetArgument(2, GetRealArg(alpha)); - kernel.SetArgument(3, GetRealArg(beta)); - kernel.SetArgument(4, static_cast<int>(a_rotated)); - kernel.SetArgument(5, a_buffer()); - kernel.SetArgument(6, static_cast<int>(a_offset)); - kernel.SetArgument(7, static_cast<int>(a_ld)); - kernel.SetArgument(8, x_buffer()); - kernel.SetArgument(9, static_cast<int>(x_offset)); - kernel.SetArgument(10, static_cast<int>(x_inc)); - kernel.SetArgument(11, y_buffer()); - kernel.SetArgument(12, static_cast<int>(y_offset)); - 
kernel.SetArgument(13, static_cast<int>(y_inc)); - kernel.SetArgument(14, static_cast<int>(a_conjugate)); - kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm - kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices - kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices - - // Launches the kernel - auto global = std::vector<size_t>{global_size}; - auto local = std::vector<size_t>{local_size}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(m_real)); + kernel.SetArgument(1, static_cast<int>(n_real)); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); + kernel.SetArgument(4, static_cast<int>(a_rotated)); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, static_cast<int>(a_offset)); + kernel.SetArgument(7, static_cast<int>(a_ld)); + kernel.SetArgument(8, x_buffer()); + kernel.SetArgument(9, static_cast<int>(x_offset)); + kernel.SetArgument(10, static_cast<int>(x_inc)); + kernel.SetArgument(11, y_buffer()); + kernel.SetArgument(12, static_cast<int>(y_offset)); + kernel.SetArgument(13, static_cast<int>(y_inc)); + kernel.SetArgument(14, static_cast<int>(a_conjugate)); + kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm + kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices + kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices + + // Launches the kernel + auto global = std::vector<size_t>{global_size}; + auto local = std::vector<size_t>{local_size}; 
+ RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= diff --git a/src/routines/level2/xgemv.hpp b/src/routines/level2/xgemv.hpp index e9afec8d..1e1fa726 100644 --- a/src/routines/level2/xgemv.hpp +++ b/src/routines/level2/xgemv.hpp @@ -28,25 +28,25 @@ class Xgemv: public Routine { Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV"); // Templated-precision implementation of the routine - StatusCode DoGemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoGemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); // Generic version used also for other matrix-vector multiplications - StatusCode MatVec(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - bool fast_kernel, bool fast_kernel_rot, - const size_t parameter, const bool packed, - const size_t kl, const size_t ku); + void MatVec(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t 
x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + bool fast_kernel, bool fast_kernel_rot, + const size_t parameter, const bool packed, + const size_t kl, const size_t ku); }; // ================================================================================================= diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp index 29cffe0c..d16ebd11 100644 --- a/src/routines/level2/xger.cpp +++ b/src/routines/level2/xger.cpp @@ -22,26 +22,25 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, { #include "../../kernels/level2/level2.opencl" #include "../../kernels/level2/xger.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xger<T>::DoGer(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { +void Xger<T>::DoGer(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { // Makes sure all dimensions are larger than zero - if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } + if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Computes whether or not the matrix has an alternative layout (row or 
column-major). const auto a_is_rowmajor = (layout == Layout::kRowMajor); @@ -49,44 +48,35 @@ StatusCode Xger<T>::DoGer(const Layout layout, const auto a_two = (a_is_rowmajor) ? m : n; // Tests the matrix and the vectors for validity - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestVectorX(m, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + TestVectorX(m, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); // Retrieves the kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xger"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(a_one)); - kernel.SetArgument(1, static_cast<int>(a_two)); - kernel.SetArgument(2, GetRealArg(alpha)); - kernel.SetArgument(3, x_buffer()); - kernel.SetArgument(4, static_cast<int>(x_offset)); - kernel.SetArgument(5, static_cast<int>(x_inc)); - kernel.SetArgument(6, y_buffer()); - kernel.SetArgument(7, static_cast<int>(y_offset)); - kernel.SetArgument(8, static_cast<int>(y_inc)); - kernel.SetArgument(9, a_buffer()); - kernel.SetArgument(10, static_cast<int>(a_offset)); - kernel.SetArgument(11, static_cast<int>(a_ld)); - kernel.SetArgument(12, static_cast<int>(a_is_rowmajor)); - - // Launches the kernel - auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]); - auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]); - auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled}; - auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return 
StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, "Xger"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(a_one)); + kernel.SetArgument(1, static_cast<int>(a_two)); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, x_buffer()); + kernel.SetArgument(4, static_cast<int>(x_offset)); + kernel.SetArgument(5, static_cast<int>(x_inc)); + kernel.SetArgument(6, y_buffer()); + kernel.SetArgument(7, static_cast<int>(y_offset)); + kernel.SetArgument(8, static_cast<int>(y_inc)); + kernel.SetArgument(9, a_buffer()); + kernel.SetArgument(10, static_cast<int>(a_offset)); + kernel.SetArgument(11, static_cast<int>(a_ld)); + kernel.SetArgument(12, static_cast<int>(a_is_rowmajor)); + + // Launches the kernel + auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]); + auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]); + auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled}; + auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; + RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= diff --git a/src/routines/level2/xger.hpp b/src/routines/level2/xger.hpp index 3c6abe44..fbbb07a1 100644 --- a/src/routines/level2/xger.hpp +++ b/src/routines/level2/xger.hpp @@ -28,12 +28,12 @@ class Xger: public Routine { Xger(Queue &queue, EventPointer event, const std::string &name = "GER"); // Templated-precision implementation of the routine - StatusCode DoGer(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); + void DoGer(const 
Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); }; // ================================================================================================= diff --git a/src/routines/level2/xgerc.cpp b/src/routines/level2/xgerc.cpp index d9feda97..4fa2e2a8 100644 --- a/src/routines/level2/xgerc.cpp +++ b/src/routines/level2/xgerc.cpp @@ -28,19 +28,19 @@ Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xgerc<T>::DoGerc(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { +void Xgerc<T>::DoGerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { // Regular Ger operation on complex data, plus conjugation in the kernel guarded by the // ROUTINE_GERC guard. 
- return DoGer(layout, m, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld); + DoGer(layout, m, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld); } // ================================================================================================= diff --git a/src/routines/level2/xgerc.hpp b/src/routines/level2/xgerc.hpp index f1d04dfd..2d61f2b7 100644 --- a/src/routines/level2/xgerc.hpp +++ b/src/routines/level2/xgerc.hpp @@ -31,12 +31,12 @@ class Xgerc: public Xger<T> { Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC"); // Templated-precision implementation of the routine - StatusCode DoGerc(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); + void DoGerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); }; // ================================================================================================= diff --git a/src/routines/level2/xgeru.cpp b/src/routines/level2/xgeru.cpp index da9e91c2..c77e69c5 100644 --- a/src/routines/level2/xgeru.cpp +++ b/src/routines/level2/xgeru.cpp @@ -28,18 +28,18 @@ Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xgeru<T>::DoGeru(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> 
&a_buffer, const size_t a_offset, const size_t a_ld) { +void Xgeru<T>::DoGeru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { // Regular Ger operation on complex data - return DoGer(layout, m, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld); + DoGer(layout, m, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld); } // ================================================================================================= diff --git a/src/routines/level2/xgeru.hpp b/src/routines/level2/xgeru.hpp index fb50e917..4cae6b58 100644 --- a/src/routines/level2/xgeru.hpp +++ b/src/routines/level2/xgeru.hpp @@ -31,12 +31,12 @@ class Xgeru: public Xger<T> { Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU"); // Templated-precision implementation of the routine - StatusCode DoGeru(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); + void DoGeru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); }; // ================================================================================================= diff --git a/src/routines/level2/xhbmv.cpp b/src/routines/level2/xhbmv.cpp index f6c0e3c4..c7c9ed9d 100644 --- a/src/routines/level2/xhbmv.cpp +++ b/src/routines/level2/xhbmv.cpp 
@@ -29,13 +29,13 @@ Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -45,13 +45,13 @@ StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle, // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the // ROUTINE_HBMV define. 
bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, false, k, 0); + MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, k, 0); } // ================================================================================================= diff --git a/src/routines/level2/xhbmv.hpp b/src/routines/level2/xhbmv.hpp index d668eb88..76d3c91e 100644 --- a/src/routines/level2/xhbmv.hpp +++ b/src/routines/level2/xhbmv.hpp @@ -33,13 +33,13 @@ class Xhbmv: public Xgemv<T> { Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV"); // Templated-precision implementation of the routine - StatusCode DoHbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoHbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xhemv.cpp b/src/routines/level2/xhemv.cpp index 2cbcf7b4..209ff654 100644 --- a/src/routines/level2/xhemv.cpp +++ b/src/routines/level2/xhemv.cpp @@ -29,13 +29,13 @@ Xhemv<T>::Xhemv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> 
-StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -45,13 +45,13 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle, // The specific hermitian matrix-accesses are implemented in the kernel guarded by the // ROUTINE_HEMV define. 
bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, false, 0, 0); + MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xhemv.hpp b/src/routines/level2/xhemv.hpp index 8e062fd3..20d2df22 100644 --- a/src/routines/level2/xhemv.hpp +++ b/src/routines/level2/xhemv.hpp @@ -33,13 +33,13 @@ class Xhemv: public Xgemv<T> { Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV"); // Templated-precision implementation of the routine - StatusCode DoHemv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoHemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp index 6dd95938..6c334e63 100644 --- a/src/routines/level2/xher.cpp +++ b/src/routines/level2/xher.cpp @@ -21,11 +21,10 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T, typename U> Xher<T,U>::Xher(Queue &queue, EventPointer 
event, const std::string &name): - Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, { #include "../../kernels/level2/level2.opencl" #include "../../kernels/level2/xher.opencl" - ; + }) { } // ================================================================================================= @@ -41,15 +40,15 @@ template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; } // The main routine template <typename T, typename U> -StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, - const size_t n, - const U alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const bool packed) { +void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, + const size_t n, + const U alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const bool packed) { // Makes sure the dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // The data is either in the upper or lower triangle const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -57,47 +56,38 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, const auto is_rowmajor = (layout == Layout::kRowMajor); // Tests the matrix and the vectors for validity - auto status = StatusCode::kSuccess; - if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); } - else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); } - if (ErrorIn(status)) { return status; } - status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } + if (packed) { TestMatrixAP(n, a_buffer, a_offset); } + else { TestMatrixA(n, n, a_buffer, 
a_offset, a_ld); } + TestVectorX(n, x_buffer, x_offset, x_inc); // If alpha is zero an update is not required - if (alpha == U{0}) { return StatusCode::kSuccess; } + if (alpha == U{0}) { return; } // Creates a matching version of alpha const auto matching_alpha = GetAlpha(alpha); // Retrieves the kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xher"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, GetRealArg(matching_alpha)); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast<int>(x_offset)); - kernel.SetArgument(4, static_cast<int>(x_inc)); - kernel.SetArgument(5, a_buffer()); - kernel.SetArgument(6, static_cast<int>(a_offset)); - kernel.SetArgument(7, static_cast<int>(a_ld)); - kernel.SetArgument(8, static_cast<int>(is_upper)); - kernel.SetArgument(9, static_cast<int>(is_rowmajor)); - - // Launches the kernel - auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]); - auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); - auto global = std::vector<size_t>{global_one, global_two}; - auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, "Xher"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, GetRealArg(matching_alpha)); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast<int>(x_offset)); + kernel.SetArgument(4, static_cast<int>(x_inc)); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, static_cast<int>(a_offset)); + kernel.SetArgument(7, static_cast<int>(a_ld)); + kernel.SetArgument(8, static_cast<int>(is_upper)); + kernel.SetArgument(9, static_cast<int>(is_rowmajor)); + + // Launches the kernel + auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]); + auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); + auto global = std::vector<size_t>{global_one, global_two}; + auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; + RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= diff --git a/src/routines/level2/xher.hpp b/src/routines/level2/xher.hpp index 9ff6bf3f..70a30bda 100644 --- a/src/routines/level2/xher.hpp +++ b/src/routines/level2/xher.hpp @@ -31,12 +31,12 @@ class Xher: public Routine { T GetAlpha(const U alpha); // Templated-precision implementation of the routine - StatusCode DoHer(const Layout layout, const Triangle triangle, - const size_t n, - const U alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const bool packed = false); + void DoHer(const Layout layout, const Triangle triangle, + const size_t n, + const U alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const bool packed = false); }; // 
================================================================================================= diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp index 3d57a9b9..11e2c871 100644 --- a/src/routines/level2/xher2.cpp +++ b/src/routines/level2/xher2.cpp @@ -21,27 +21,26 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, { #include "../../kernels/level2/level2.opencl" #include "../../kernels/level2/xher2.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const bool packed) { +void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const bool packed) { // Makes sure the dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // The data is either in the upper or lower triangle const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -49,46 +48,36 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, const auto is_rowmajor = (layout == 
Layout::kRowMajor); // Tests the matrix and the vectors for validity - auto status = StatusCode::kSuccess; - if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); } - else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); } - if (ErrorIn(status)) { return status; } - status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + if (packed) { TestMatrixAP(n, a_buffer, a_offset); } + else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); // Retrieves the kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xher2"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, GetRealArg(alpha)); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast<int>(x_offset)); - kernel.SetArgument(4, static_cast<int>(x_inc)); - kernel.SetArgument(5, y_buffer()); - kernel.SetArgument(6, static_cast<int>(y_offset)); - kernel.SetArgument(7, static_cast<int>(y_inc)); - kernel.SetArgument(8, a_buffer()); - kernel.SetArgument(9, static_cast<int>(a_offset)); - kernel.SetArgument(10, static_cast<int>(a_ld)); - kernel.SetArgument(11, static_cast<int>(is_upper)); - kernel.SetArgument(12, static_cast<int>(is_rowmajor)); - - // Launches the kernel - auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]); - auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); - auto global = std::vector<size_t>{global_one, global_two}; - auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, "Xher2"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, GetRealArg(alpha)); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast<int>(x_offset)); + kernel.SetArgument(4, static_cast<int>(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, static_cast<int>(y_offset)); + kernel.SetArgument(7, static_cast<int>(y_inc)); + kernel.SetArgument(8, a_buffer()); + kernel.SetArgument(9, static_cast<int>(a_offset)); + kernel.SetArgument(10, static_cast<int>(a_ld)); + kernel.SetArgument(11, static_cast<int>(is_upper)); + kernel.SetArgument(12, static_cast<int>(is_rowmajor)); + + // Launches the kernel + auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]); + auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); + auto global = std::vector<size_t>{global_one, global_two}; + auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; + RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= diff --git a/src/routines/level2/xher2.hpp b/src/routines/level2/xher2.hpp index 8c53c047..dcb2ecb7 100644 --- a/src/routines/level2/xher2.hpp +++ b/src/routines/level2/xher2.hpp @@ -28,13 +28,13 @@ class Xher2: public Routine { Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2"); // Templated-precision implementation of the routine - StatusCode DoHer2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const bool packed = false); + void DoHer2(const Layout 
layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const bool packed = false); }; // ================================================================================================= diff --git a/src/routines/level2/xhpmv.cpp b/src/routines/level2/xhpmv.cpp index e6f82b34..70a0ab0d 100644 --- a/src/routines/level2/xhpmv.cpp +++ b/src/routines/level2/xhpmv.cpp @@ -29,13 +29,13 @@ Xhpmv<T>::Xhpmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -45,13 +45,13 @@ StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle, // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the // ROUTINE_HPMV define. 
bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - ap_buffer, ap_offset, n, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, true, 0, 0); + MatVec(layout, Transpose::kNo, + n, n, alpha, + ap_buffer, ap_offset, n, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, true, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xhpmv.hpp b/src/routines/level2/xhpmv.hpp index b11192f9..13a6277c 100644 --- a/src/routines/level2/xhpmv.hpp +++ b/src/routines/level2/xhpmv.hpp @@ -33,13 +33,13 @@ class Xhpmv: public Xgemv<T> { Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV"); // Templated-precision implementation of the routine - StatusCode DoHpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoHpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xhpr.cpp b/src/routines/level2/xhpr.cpp index 225ebfe5..7e517c59 100644 --- a/src/routines/level2/xhpr.cpp +++ b/src/routines/level2/xhpr.cpp @@ -28,17 +28,17 @@ Xhpr<T,U>::Xhpr(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T, typename U> -StatusCode Xhpr<T,U>::DoHpr(const Layout layout, const 
Triangle triangle, - const size_t n, - const U alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset) { +void Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle, + const size_t n, + const U alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset) { // Specific Xhpr functionality is implemented in the kernel using defines - return DoHer(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, n, - true); // packed matrix + DoHer(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, n, + true); // packed matrix } // ================================================================================================= diff --git a/src/routines/level2/xhpr.hpp b/src/routines/level2/xhpr.hpp index 37801c68..6ebc220e 100644 --- a/src/routines/level2/xhpr.hpp +++ b/src/routines/level2/xhpr.hpp @@ -31,11 +31,11 @@ class Xhpr: public Xher<T,U> { Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR"); // Templated-precision implementation of the routine - StatusCode DoHpr(const Layout layout, const Triangle triangle, - const size_t n, - const U alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset); + void DoHpr(const Layout layout, const Triangle triangle, + const size_t n, + const U alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset); }; // ================================================================================================= diff --git a/src/routines/level2/xhpr2.cpp b/src/routines/level2/xhpr2.cpp index 85f9d3f9..35daa365 100644 --- a/src/routines/level2/xhpr2.cpp +++ b/src/routines/level2/xhpr2.cpp @@ -28,19 +28,19 @@ Xhpr2<T>::Xhpr2(Queue &queue, EventPointer event, const 
std::string &name): // The main routine template <typename T> -StatusCode Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset) { +void Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset) { // Specific Xhpr2 functionality is implemented in the kernel using defines - return DoHer2(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, n, - true); // packed matrix + DoHer2(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, n, + true); // packed matrix } // ================================================================================================= diff --git a/src/routines/level2/xhpr2.hpp b/src/routines/level2/xhpr2.hpp index d66dce55..f344fd48 100644 --- a/src/routines/level2/xhpr2.hpp +++ b/src/routines/level2/xhpr2.hpp @@ -31,12 +31,12 @@ class Xhpr2: public Xher2<T> { Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2"); // Templated-precision implementation of the routine - StatusCode DoHpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset); + void DoHpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> 
&y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset); }; // ================================================================================================= diff --git a/src/routines/level2/xsbmv.cpp b/src/routines/level2/xsbmv.cpp index 28730899..e47430d1 100644 --- a/src/routines/level2/xsbmv.cpp +++ b/src/routines/level2/xsbmv.cpp @@ -29,13 +29,13 @@ Xsbmv<T>::Xsbmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -45,13 +45,13 @@ StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle, // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the // ROUTINE_SBMV define. 
bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, false, k, 0); + MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, k, 0); } // ================================================================================================= diff --git a/src/routines/level2/xsbmv.hpp b/src/routines/level2/xsbmv.hpp index 16c5e9a8..a4542f49 100644 --- a/src/routines/level2/xsbmv.hpp +++ b/src/routines/level2/xsbmv.hpp @@ -33,13 +33,13 @@ class Xsbmv: public Xgemv<T> { Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV"); // Templated-precision implementation of the routine - StatusCode DoSbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoSbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xspmv.cpp b/src/routines/level2/xspmv.cpp index f6651012..bf1a49e1 100644 --- a/src/routines/level2/xspmv.cpp +++ b/src/routines/level2/xspmv.cpp @@ -29,13 +29,13 @@ Xspmv<T>::Xspmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> 
-StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -45,13 +45,13 @@ StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle, // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the // ROUTINE_SPMV define. bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - ap_buffer, ap_offset, n, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, true, 0, 0); + MatVec(layout, Transpose::kNo, + n, n, alpha, + ap_buffer, ap_offset, n, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, true, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xspmv.hpp b/src/routines/level2/xspmv.hpp index a0c69b85..94caa4ac 100644 --- a/src/routines/level2/xspmv.hpp +++ b/src/routines/level2/xspmv.hpp @@ -33,13 +33,13 @@ class Xspmv: public Xgemv<T> { Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV"); // Templated-precision implementation of the routine - StatusCode DoSpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> 
&ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoSpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xspr.cpp b/src/routines/level2/xspr.cpp index a75fe9c3..56791a7b 100644 --- a/src/routines/level2/xspr.cpp +++ b/src/routines/level2/xspr.cpp @@ -28,17 +28,17 @@ Xspr<T>::Xspr(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset) { +void Xspr<T>::DoSpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset) { // Specific Xspr functionality is implemented in the kernel using defines - return DoHer(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, n, - true); // packed matrix + DoHer(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, n, + true); // packed matrix } // ================================================================================================= diff --git a/src/routines/level2/xspr.hpp b/src/routines/level2/xspr.hpp index 6468c736..760a2ddb 100644 --- a/src/routines/level2/xspr.hpp +++ b/src/routines/level2/xspr.hpp @@ -31,11 +31,11 
@@ class Xspr: public Xher<T,T> { Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR"); // Templated-precision implementation of the routine - StatusCode DoSpr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset); + void DoSpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset); }; // ================================================================================================= diff --git a/src/routines/level2/xspr2.cpp b/src/routines/level2/xspr2.cpp index c39a2eb4..8d0432c2 100644 --- a/src/routines/level2/xspr2.cpp +++ b/src/routines/level2/xspr2.cpp @@ -28,19 +28,19 @@ Xspr2<T>::Xspr2(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset) { +void Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset) { // Specific Xspr2 functionality is implemented in the kernel using defines - return DoHer2(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, n, - true); // packed matrix + DoHer2(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, n, + 
true); // packed matrix } // ================================================================================================= diff --git a/src/routines/level2/xspr2.hpp b/src/routines/level2/xspr2.hpp index 693c56a1..9f03f768 100644 --- a/src/routines/level2/xspr2.hpp +++ b/src/routines/level2/xspr2.hpp @@ -31,12 +31,12 @@ class Xspr2: public Xher2<T> { Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2"); // Templated-precision implementation of the routine - StatusCode DoSpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset); + void DoSpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset); }; // ================================================================================================= diff --git a/src/routines/level2/xsymv.cpp b/src/routines/level2/xsymv.cpp index 648d2a3e..86bb66b8 100644 --- a/src/routines/level2/xsymv.cpp +++ b/src/routines/level2/xsymv.cpp @@ -29,13 +29,13 @@ Xsymv<T>::Xsymv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, 
const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -45,13 +45,13 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle, // The specific symmetric matrix-accesses are implemented in the kernel guarded by the // ROUTINE_SYMV define. bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, false, 0, 0); + MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xsymv.hpp b/src/routines/level2/xsymv.hpp index 67815f2f..3945802f 100644 --- a/src/routines/level2/xsymv.hpp +++ b/src/routines/level2/xsymv.hpp @@ -33,13 +33,13 @@ class Xsymv: public Xgemv<T> { Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV"); // Templated-precision implementation of the routine - StatusCode DoSymv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoSymv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + 
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xsyr.cpp b/src/routines/level2/xsyr.cpp index 758d8f8f..64c2dc74 100644 --- a/src/routines/level2/xsyr.cpp +++ b/src/routines/level2/xsyr.cpp @@ -28,16 +28,16 @@ Xsyr<T>::Xsyr(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { +void Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { // Specific Xsyr functionality is implemented in the kernel using defines - return DoHer(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - a_buffer, a_offset, a_ld); + DoHer(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld); } // ================================================================================================= diff --git a/src/routines/level2/xsyr.hpp b/src/routines/level2/xsyr.hpp index 20393454..a23ff80f 100644 --- a/src/routines/level2/xsyr.hpp +++ b/src/routines/level2/xsyr.hpp @@ -31,11 +31,11 @@ class Xsyr: public Xher<T,T> { Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR"); // Templated-precision implementation of the routine - StatusCode DoSyr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); + void DoSyr(const Layout layout, const 
Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); }; // ================================================================================================= diff --git a/src/routines/level2/xsyr2.cpp b/src/routines/level2/xsyr2.cpp index 6f43b219..38ca9d69 100644 --- a/src/routines/level2/xsyr2.cpp +++ b/src/routines/level2/xsyr2.cpp @@ -28,18 +28,18 @@ Xsyr2<T>::Xsyr2(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { +void Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { // Specific Xsyr2 functionality is implemented in the kernel using defines - return DoHer2(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld); + DoHer2(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld); } // ================================================================================================= diff --git a/src/routines/level2/xsyr2.hpp b/src/routines/level2/xsyr2.hpp index 1a8dcbe8..5a8d8eb4 100644 --- a/src/routines/level2/xsyr2.hpp +++ b/src/routines/level2/xsyr2.hpp @@ -31,12 +31,12 @@ class Xsyr2: public Xher2<T> { Xsyr2(Queue &queue, EventPointer event, const std::string &name = 
"SYR2"); // Templated-precision implementation of the routine - StatusCode DoSyr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); + void DoSyr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); }; // ================================================================================================= diff --git a/src/routines/level2/xtbmv.cpp b/src/routines/level2/xtbmv.cpp index e315c544..f4a58ed2 100644 --- a/src/routines/level2/xtbmv.cpp +++ b/src/routines/level2/xtbmv.cpp @@ -29,17 +29,15 @@ Xtbmv<T>::Xtbmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Creates a copy of X: a temporary scratch buffer auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset); - try { - x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); - } catch (...) 
{ } // Continues: error-code is returned in MatVec + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -52,20 +50,22 @@ StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle, // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the // ROUTINE_TBMV define. auto fast_kernels = false; - auto status = MatVec(layout, a_transpose, - n, n, static_cast<T>(1), - a_buffer, a_offset, a_ld, - scratch_buffer, x_offset, x_inc, static_cast<T>(0), - x_buffer, x_offset, x_inc, - fast_kernels, fast_kernels, - parameter, false, k, 0); - - // Returns the proper error code (renames vector Y to X) - switch(status) { - case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; - case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; - case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; - default: return status; + try { + MatVec(layout, a_transpose, + n, n, static_cast<T>(1), + a_buffer, a_offset, a_ld, + scratch_buffer, x_offset, x_inc, static_cast<T>(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, false, k, 0); + } catch (BLASError &e) { + // Returns the proper error code (renames vector Y to X) + switch (e.status()) { + case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details()); + case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details()); + case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details()); + default: throw; + } } } diff --git a/src/routines/level2/xtbmv.hpp b/src/routines/level2/xtbmv.hpp index 389e9705..abd12db6 100644 --- a/src/routines/level2/xtbmv.hpp +++ b/src/routines/level2/xtbmv.hpp @@ -35,11 +35,11 @@ class Xtbmv: public Xgemv<T> { Xtbmv(Queue &queue, 
EventPointer event, const std::string &name = "TBMV"); // Templated-precision implementation of the routine - StatusCode DoTbmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoTbmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xtpmv.cpp b/src/routines/level2/xtpmv.cpp index 46811089..c0d26699 100644 --- a/src/routines/level2/xtpmv.cpp +++ b/src/routines/level2/xtpmv.cpp @@ -29,17 +29,15 @@ Xtpmv<T>::Xtpmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Creates a copy of X: a temporary scratch buffer auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset); - try { - x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); - } catch (...) 
{ } // Continues: error-code is returned in MatVec + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -52,20 +50,22 @@ StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle, // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the // ROUTINE_TPMV define. auto fast_kernels = false; - auto status = MatVec(layout, a_transpose, - n, n, static_cast<T>(1), - ap_buffer, ap_offset, n, - scratch_buffer, x_offset, x_inc, static_cast<T>(0), - x_buffer, x_offset, x_inc, - fast_kernels, fast_kernels, - parameter, true, 0, 0); - - // Returns the proper error code (renames vector Y to X) - switch(status) { - case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; - case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; - case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; - default: return status; + try { + MatVec(layout, a_transpose, + n, n, static_cast<T>(1), + ap_buffer, ap_offset, n, + scratch_buffer, x_offset, x_inc, static_cast<T>(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, true, 0, 0); + } catch (BLASError &e) { + // Returns the proper error code (renames vector Y to X) + switch (e.status()) { + case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details()); + case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details()); + case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details()); + default: throw; + } } } diff --git a/src/routines/level2/xtpmv.hpp b/src/routines/level2/xtpmv.hpp index 0e8cf1d2..5b3954e8 100644 --- a/src/routines/level2/xtpmv.hpp +++ b/src/routines/level2/xtpmv.hpp @@ -35,11 +35,11 @@ class Xtpmv: public Xgemv<T> { Xtpmv(Queue &queue, EventPointer 
event, const std::string &name = "TPMV"); // Templated-precision implementation of the routine - StatusCode DoTpmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoTpmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xtrmv.cpp b/src/routines/level2/xtrmv.cpp index d2f24252..5fff9b31 100644 --- a/src/routines/level2/xtrmv.cpp +++ b/src/routines/level2/xtrmv.cpp @@ -29,17 +29,15 @@ Xtrmv<T>::Xtrmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Creates a copy of X: a temporary scratch buffer auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset); - try { - x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); - } catch (...) 
{ } // Continues: error-code is returned in MatVec + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -52,20 +50,22 @@ StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle, // The specific triangular matrix-accesses are implemented in the kernel guarded by the // ROUTINE_TRMV define. auto fast_kernels = false; - auto status = MatVec(layout, a_transpose, - n, n, static_cast<T>(1), - a_buffer, a_offset, a_ld, - scratch_buffer, x_offset, x_inc, static_cast<T>(0), - x_buffer, x_offset, x_inc, - fast_kernels, fast_kernels, - parameter, false, 0, 0); - - // Returns the proper error code (renames vector Y to X) - switch(status) { - case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; - case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; - case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; - default: return status; + try { + MatVec(layout, a_transpose, + n, n, static_cast<T>(1), + a_buffer, a_offset, a_ld, + scratch_buffer, x_offset, x_inc, static_cast<T>(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, false, 0, 0); + } catch (BLASError &e) { + // Returns the proper error code (renames vector Y to X) + switch (e.status()) { + case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details()); + case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details()); + case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details()); + default: throw; + } } } diff --git a/src/routines/level2/xtrmv.hpp b/src/routines/level2/xtrmv.hpp index 07dd7841..b028ee68 100644 --- a/src/routines/level2/xtrmv.hpp +++ b/src/routines/level2/xtrmv.hpp @@ -35,11 +35,11 @@ class Xtrmv: public Xgemv<T> { Xtrmv(Queue &queue, EventPointer 
event, const std::string &name = "TRMV"); // Templated-precision implementation of the routine - StatusCode DoTrmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoTrmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 1602c69f..4f70dc7a 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -24,8 +24,7 @@ template <typename T> Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"}, - PrecisionValue<T>()) { - source_string_ = + PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" @@ -37,30 +36,28 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_direct_part1.opencl" #include "../../kernels/level3/xgemm_direct_part2.opencl" #include "../../kernels/level3/xgemm_direct_part3.opencl" - ; - auto source_string_part_2 = // separated in two parts to prevent C1091 in MSVC 2013 + , // separated in two parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" - ; - source_string_ += 
source_string_part_2; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xgemm<T>::DoGemm(const Layout layout, - const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { +void Xgemm<T>::DoGemm(const Layout layout, + const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; } + if ((m == 0) || (n == 0) || (k == 0)) { throw BLASError(StatusCode::kInvalidDimension); } // Computes whether or not the matrices are transposed in memory. This is based on their layout // (row or column-major) and whether or not they are requested to be pre-transposed. 
Note @@ -99,12 +96,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, // matrix A cannot be less than K when rotated, or less than M when not-rotated // matrix B cannot be less than N when rotated, or less than K when not-rotated // matrix C cannot be less than N when rotated, or less than M when not-rotated - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); + TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld); // Selects which version of GEMM to run const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]); @@ -131,7 +125,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, // requirements, but several pre and post-processing kernels take care of those. However, the // overhead of these extra kernels might not be ideal for certain devices/arguments. 
template <typename T> -StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, +void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, @@ -142,8 +136,6 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k const size_t a_one, const size_t a_two, const bool a_want_rotated, const size_t b_one, const size_t b_two, const bool b_want_rotated, const size_t c_one, const size_t c_two, const bool c_want_rotated) { - auto status = StatusCode::kSuccess; - // Calculates the ceiled versions of m, n, and k const auto m_ceiled = Ceil(m, db_["MWG"]); const auto n_ceiled = Ceil(n, db_["NWG"]); @@ -158,109 +150,95 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled; const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled; - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 && - a_do_transpose == false && a_conjugate == false; - auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 && - b_do_transpose == false && b_conjugate == false; - auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 && - c_do_transpose == false; - - // Creates the temporary matrices - const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i); - const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i); - const auto c_temp = (c_no_temp) ? 
c_buffer : Buffer<T>(context_, c_one_i*c_two_i); - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector<Event>(); - auto emptyEventList = std::vector<Event>(); - - // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. - if (!a_no_temp) { - auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - a_one_i, a_two_i, a_one_i, 0, a_temp, - ConstantOne<T>(), program, - true, a_do_transpose, a_conjugate); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessA); - } - - // As above, but now for matrix B - if (!b_no_temp) { - auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, - b_one, b_two, b_ld, b_offset, b_buffer, - b_one_i, b_two_i, b_one_i, 0, b_temp, - ConstantOne<T>(), program, - true, b_do_transpose, b_conjugate); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessB); - } - - // As above, but now for matrix C. This is only necessary if C is used both as input and output. 
- if (!c_no_temp && beta != static_cast<T>(0)) { - auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, - c_one, c_two, c_ld, c_offset, c_buffer, - c_one_i, c_two_i, c_one_i, 0, c_temp, - ConstantOne<T>(), program, - true, c_do_transpose, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessC); - } - - // Retrieves the Xgemm kernel from the compiled binary - try { - auto kernel = Kernel(program, "Xgemm"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(m_ceiled)); - kernel.SetArgument(1, static_cast<int>(n_ceiled)); - kernel.SetArgument(2, static_cast<int>(k_ceiled)); - kernel.SetArgument(3, GetRealArg(alpha)); - kernel.SetArgument(4, GetRealArg(beta)); - kernel.SetArgument(5, a_temp()); - kernel.SetArgument(6, b_temp()); - kernel.SetArgument(7, c_temp()); - - // Computes the global and local thread sizes - const auto global = std::vector<size_t>{ - (c_one_i * db_["MDIMC"]) / db_["MWG"], - (c_two_i * db_["NDIMC"]) / db_["NWG"] - }; - const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel = Event(); - auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_; - status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Runs the post-processing kernel if needed - if (!c_no_temp) { - eventWaitList.push_back(eventKernel); - status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, - c_one_i, c_two_i, c_one_i, 0, c_temp, - c_one, c_two, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, - false, c_do_transpose, false); - if (ErrorIn(status)) { return status; } - } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 && + a_do_transpose == false && a_conjugate == false; + auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 && + b_do_transpose == false && b_conjugate == false; + auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 && + c_do_transpose == false; + + // Creates the temporary matrices + const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i); + const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i); + const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i); + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. 
+ if (!a_no_temp) { + auto eventProcessA = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + a_one_i, a_two_i, a_one_i, 0, a_temp, + ConstantOne<T>(), program, + true, a_do_transpose, a_conjugate); + eventWaitList.push_back(eventProcessA); + } + + // As above, but now for matrix B + if (!b_no_temp) { + auto eventProcessB = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, + b_one, b_two, b_ld, b_offset, b_buffer, + b_one_i, b_two_i, b_one_i, 0, b_temp, + ConstantOne<T>(), program, + true, b_do_transpose, b_conjugate); + eventWaitList.push_back(eventProcessB); + } + + // As above, but now for matrix C. This is only necessary if C is used both as input and output. + if (!c_no_temp && beta != static_cast<T>(0)) { + auto eventProcessC = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, + c_one, c_two, c_ld, c_offset, c_buffer, + c_one_i, c_two_i, c_one_i, 0, c_temp, + ConstantOne<T>(), program, + true, c_do_transpose, false); + eventWaitList.push_back(eventProcessC); + } + + // Retrieves the Xgemm kernel from the compiled binary + auto kernel = Kernel(program, "Xgemm"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(m_ceiled)); + kernel.SetArgument(1, static_cast<int>(n_ceiled)); + kernel.SetArgument(2, static_cast<int>(k_ceiled)); + kernel.SetArgument(3, GetRealArg(alpha)); + kernel.SetArgument(4, GetRealArg(beta)); + kernel.SetArgument(5, a_temp()); + kernel.SetArgument(6, b_temp()); + kernel.SetArgument(7, c_temp()); + + // Computes the global and local thread sizes + const auto global = std::vector<size_t>{ + (c_one_i * db_["MDIMC"]) / db_["MWG"], + (c_two_i * db_["NDIMC"]) / db_["NWG"] + }; + const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel = Event(); + auto eventPointer = (!c_no_temp) ? 
eventKernel.pointer() : event_; + RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList); + + // Runs the post-processing kernel if needed + if (!c_no_temp) { + eventWaitList.push_back(eventKernel); + PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, + c_one_i, c_two_i, c_one_i, 0, c_temp, + c_one, c_two, c_ld, c_offset, c_buffer, + ConstantOne<T>(), program, + false, c_do_transpose, false); + } } @@ -268,7 +246,7 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k // The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels. template <typename T> -StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k, +void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, @@ -281,46 +259,40 @@ StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k, const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); // Retrieves the proper XgemmDirect kernel from the compiled binary - try { - const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") : - (b_do_transpose ? 
"XgemmDirectNT" : "XgemmDirectNN"); - auto kernel = Kernel(program, name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(m)); - kernel.SetArgument(1, static_cast<int>(n)); - kernel.SetArgument(2, static_cast<int>(k)); - kernel.SetArgument(3, GetRealArg(alpha)); - kernel.SetArgument(4, GetRealArg(beta)); - kernel.SetArgument(5, a_buffer()); - kernel.SetArgument(6, static_cast<int>(a_offset)); - kernel.SetArgument(7, static_cast<int>(a_ld)); - kernel.SetArgument(8, b_buffer()); - kernel.SetArgument(9, static_cast<int>(b_offset)); - kernel.SetArgument(10, static_cast<int>(b_ld)); - kernel.SetArgument(11, c_buffer()); - kernel.SetArgument(12, static_cast<int>(c_offset)); - kernel.SetArgument(13, static_cast<int>(c_ld)); - kernel.SetArgument(14, static_cast<int>(c_do_transpose)); - kernel.SetArgument(15, static_cast<int>(a_conjugate)); - kernel.SetArgument(16, static_cast<int>(b_conjugate)); - - // Computes the global and local thread sizes - const auto m_ceiled = Ceil(m, db_["WGD"]); - const auto n_ceiled = Ceil(n, db_["WGD"]); - const auto global = std::vector<size_t>{ - (m_ceiled * db_["MDIMCD"]) / db_["WGD"], - (n_ceiled * db_["NDIMCD"]) / db_["WGD"] - }; - const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]}; - - // Launches the kernel - auto status = RunKernel(kernel, queue_, device_, global, local, event_); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") : + (b_do_transpose ? 
"XgemmDirectNT" : "XgemmDirectNN"); + auto kernel = Kernel(program, name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(m)); + kernel.SetArgument(1, static_cast<int>(n)); + kernel.SetArgument(2, static_cast<int>(k)); + kernel.SetArgument(3, GetRealArg(alpha)); + kernel.SetArgument(4, GetRealArg(beta)); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, static_cast<int>(a_offset)); + kernel.SetArgument(7, static_cast<int>(a_ld)); + kernel.SetArgument(8, b_buffer()); + kernel.SetArgument(9, static_cast<int>(b_offset)); + kernel.SetArgument(10, static_cast<int>(b_ld)); + kernel.SetArgument(11, c_buffer()); + kernel.SetArgument(12, static_cast<int>(c_offset)); + kernel.SetArgument(13, static_cast<int>(c_ld)); + kernel.SetArgument(14, static_cast<int>(c_do_transpose)); + kernel.SetArgument(15, static_cast<int>(a_conjugate)); + kernel.SetArgument(16, static_cast<int>(b_conjugate)); + + // Computes the global and local thread sizes + const auto m_ceiled = Ceil(m, db_["WGD"]); + const auto n_ceiled = Ceil(n, db_["WGD"]); + const auto global = std::vector<size_t>{ + (m_ceiled * db_["MDIMCD"]) / db_["WGD"], + (n_ceiled * db_["NDIMCD"]) / db_["WGD"] + }; + const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]}; + + // Launches the kernel + RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp index 46e12453..c61611b6 100644 --- a/src/routines/level3/xgemm.hpp +++ b/src/routines/level3/xgemm.hpp @@ -28,36 +28,36 @@ class Xgemm: public Routine { Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM"); // Templated-precision implementation of the routine - StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, + void DoGemm(const Layout 
layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + + // Indirect version of GEMM (with pre and post-processing kernels) + void GemmIndirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); - - // Indirect version of GEMM (with pre and post-processing kernels) - StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld, - const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, - const bool a_conjugate, const bool b_conjugate, - const size_t a_one, const size_t a_two, const bool a_want_rotated, - const size_t b_one, const size_t b_two, const bool b_want_rotated, - const size_t c_one, const size_t c_two, const bool c_want_rotated); + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate, + const size_t a_one, const size_t a_two, const bool a_want_rotated, + const size_t b_one, const size_t b_two, const bool b_want_rotated, + const size_t c_one, const size_t c_two, const bool c_want_rotated); // Direct version of GEMM (no pre and post-processing kernels) - StatusCode GemmDirect(const size_t m, const 
size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld, - const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, - const bool a_conjugate, const bool b_conjugate); + void GemmDirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate); }; // ================================================================================================= diff --git a/src/routines/level3/xhemm.cpp b/src/routines/level3/xhemm.cpp index 9813503e..e5b1502a 100644 --- a/src/routines/level3/xhemm.cpp +++ b/src/routines/level3/xhemm.cpp @@ -29,7 +29,7 @@ Xhemm<T>::Xhemm(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle, +void Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, @@ -38,15 +38,14 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } + if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Computes the k dimension. 
This is based on whether or not the hermitian matrix is A (on the // left) or B (on the right) in the Xgemm routine. auto k = (side == Side::kLeft) ? m : n; // Checks for validity of the squared A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(k, k, a_buffer, a_offset, a_ld); // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix @@ -55,73 +54,68 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared"; // Temporary buffer for a copy of the hermitian matrix - try { - auto temp_herm = Buffer<T>(context_, k*k); - - // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm - // routine afterwards + auto temp_herm = Buffer<T>(context_, k*k); + + // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm + // routine afterwards + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the hermitian-to-squared kernel + kernel.SetArgument(0, static_cast<int>(k)); + kernel.SetArgument(1, static_cast<int>(a_ld)); + kernel.SetArgument(2, static_cast<int>(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast<int>(k)); + kernel.SetArgument(5, static_cast<int>(k)); + kernel.SetArgument(6, static_cast<int>(0)); + kernel.SetArgument(7, temp_herm()); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // hermitian-to-squared kernel uses the same parameters. 
+ auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + auto kernelEvent = Event(); + RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); + + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + + // Runs the regular Xgemm code with either "C := AB+C" or ... + if (side == Side::kLeft) { + DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + temp_herm, 0, k, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld); + } + + // ... with "C := BA+C". Note that A and B are now reversed. + else { try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the hermitian-to-squared kernel - kernel.SetArgument(0, static_cast<int>(k)); - kernel.SetArgument(1, static_cast<int>(a_ld)); - kernel.SetArgument(2, static_cast<int>(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast<int>(k)); - kernel.SetArgument(5, static_cast<int>(k)); - kernel.SetArgument(6, static_cast<int>(0)); - kernel.SetArgument(7, temp_herm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // hermitian-to-squared kernel uses the same parameters. 
- auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - - // Synchronize now: 'DoGemm' does not accept a list of events to wait for - kernelEvent.WaitForCompletion(); - - // Runs the regular Xgemm code with either "C := AB+C" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - temp_herm, 0, k, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld); - } - - // ... with "C := BA+C". Note that A and B are now reversed. - else { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_herm, 0, k, - beta, - c_buffer, c_offset, c_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } + DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_herm, 0, k, + beta, + c_buffer, c_offset, c_ld); + } catch (BLASError &e) { + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(e.status()) { + case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, 
e.details()); + case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details()); + case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details()); + case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details()); + case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details()); + case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details()); + default: throw; } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + } + } } // ================================================================================================= diff --git a/src/routines/level3/xhemm.hpp b/src/routines/level3/xhemm.hpp index 272bd2ec..2385706e 100644 --- a/src/routines/level3/xhemm.hpp +++ b/src/routines/level3/xhemm.hpp @@ -37,13 +37,13 @@ class Xhemm: public Xgemm<T> { Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM"); // Templated-precision implementation of the routine - StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + void DoHemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); }; // 
================================================================================================= diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index bf328729..ee3bb8b8 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -22,8 +22,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T, typename U> Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" @@ -32,23 +31,23 @@ Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T, typename U> -StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const U beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { +void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + const Buffer<T> 
&c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or // to matrix A (argument: conjugate transpose) @@ -71,12 +70,9 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co // matrix A cannot be less than N when rotated, or less than K when not-rotated // matrix B cannot be less than N when rotated, or less than K when not-rotated // matrix C cannot be less than N - auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld); + TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld); + TestMatrixC(n, n, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); @@ -85,145 +81,128 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? 
"XgemmUpper" : "XgemmLower"; - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - ab_rotated == false && ab_conjugate == false; - auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - ab_rotated == false && ab_conjugate == true; - auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && - ab_rotated == false && ab_conjugate == false; - auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && - ab_rotated == false && ab_conjugate == true; - - // Creates the temporary matrices - auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); - - // Convert the arguments to complex versions - auto complex_beta = T{beta, static_cast<U>(0.0)}; - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector<Event>(); - auto emptyEventList = std::vector<Event>(); - - // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. 
- if (!a1_no_temp) { - auto eventProcessA1 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList, - ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, - ConstantOne<T>(), program, - true, ab_rotated, ab_conjugate); - eventWaitList.push_back(eventProcessA1); - if (ErrorIn(status)) { return status; } - } - if (!a2_no_temp) { - auto eventProcessA2 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList, - ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, - ConstantOne<T>(), program, - true, ab_rotated, !ab_conjugate); - eventWaitList.push_back(eventProcessA2); - if (ErrorIn(status)) { return status; } - } - if (!b1_no_temp) { - auto eventProcessB1 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList, - ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, - ConstantOne<T>(), program, - true, ab_rotated, ab_conjugate); - eventWaitList.push_back(eventProcessB1); - if (ErrorIn(status)) { return status; } - } - if (!b2_no_temp) { - auto eventProcessB2 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList, - ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, - ConstantOne<T>(), program, - true, ab_rotated, !ab_conjugate); - eventWaitList.push_back(eventProcessB2); - if (ErrorIn(status)) { return status; } - } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. 
- auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, - n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, - true, c_rotated, false); - eventWaitList.push_back(eventProcessC); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(n_ceiled)); - kernel.SetArgument(1, static_cast<int>(k_ceiled)); - kernel.SetArgument(2, GetRealArg(alpha)); - kernel.SetArgument(3, GetRealArg(complex_beta)); - kernel.SetArgument(4, a1_temp()); - kernel.SetArgument(5, b2_temp()); - kernel.SetArgument(6, c_temp()); - - // Computes the global and local thread sizes - auto global = std::vector<size_t>{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel1 = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel1); - - // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha - auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; - auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)}; - kernel.SetArgument(2, GetRealArg(conjugate_alpha)); - kernel.SetArgument(3, GetRealArg(complex_one)); - kernel.SetArgument(4, b1_temp()); - kernel.SetArgument(5, a2_temp()); - - // Runs the kernel again - auto eventKernel2 = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel2); - - // Runs the post-processing kernel - auto upper = (triangle == 
Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, - false, c_rotated, false, upper, lower, true); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + ab_rotated == false && ab_conjugate == false; + auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + ab_rotated == false && ab_conjugate == true; + auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && + ab_rotated == false && ab_conjugate == false; + auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && + ab_rotated == false && ab_conjugate == true; + + // Creates the temporary matrices + auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto b2_temp = (b2_no_temp) ? 
b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); + + // Convert the arguments to complex versions + auto complex_beta = T{beta, static_cast<U>(0.0)}; + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. + if (!a1_no_temp) { + auto eventProcessA1 = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, + ConstantOne<T>(), program, + true, ab_rotated, ab_conjugate); + eventWaitList.push_back(eventProcessA1); + } + if (!a2_no_temp) { + auto eventProcessA2 = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, + ConstantOne<T>(), program, + true, ab_rotated, !ab_conjugate); + eventWaitList.push_back(eventProcessA2); + } + if (!b1_no_temp) { + auto eventProcessB1 = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, + ConstantOne<T>(), program, + true, ab_rotated, ab_conjugate); + eventWaitList.push_back(eventProcessB1); + } + if (!b2_no_temp) { + auto eventProcessB2 = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, + ConstantOne<T>(), program, + true, ab_rotated, !ab_conjugate); + eventWaitList.push_back(eventProcessB2); + } 
+ + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + auto eventProcessC = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + ConstantOne<T>(), program, + true, c_rotated, false); + eventWaitList.push_back(eventProcessC); + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n_ceiled)); + kernel.SetArgument(1, static_cast<int>(k_ceiled)); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(complex_beta)); + kernel.SetArgument(4, a1_temp()); + kernel.SetArgument(5, b2_temp()); + kernel.SetArgument(6, c_temp()); + + // Computes the global and local thread sizes + auto global = std::vector<size_t>{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel1 = Event(); + RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); + eventWaitList.push_back(eventKernel1); + + // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha + auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; + auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)}; + kernel.SetArgument(2, GetRealArg(conjugate_alpha)); + kernel.SetArgument(3, GetRealArg(complex_one)); + kernel.SetArgument(4, b1_temp()); + kernel.SetArgument(5, a2_temp()); + + // Runs the kernel again + auto eventKernel2 = Event(); + RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); + eventWaitList.push_back(eventKernel2); + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = 
(triangle == Triangle::kLower); + PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + n, n, c_ld, c_offset, c_buffer, + ConstantOne<T>(), program, + false, c_rotated, false, upper, lower, true); } // ================================================================================================= diff --git a/src/routines/level3/xher2k.hpp b/src/routines/level3/xher2k.hpp index 23996219..acc346e4 100644 --- a/src/routines/level3/xher2k.hpp +++ b/src/routines/level3/xher2k.hpp @@ -30,13 +30,13 @@ class Xher2k: public Routine { Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K"); // Templated-precision implementation of the routine - StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const U beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + void DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index 77422526..ae8e9324 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -22,8 +22,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T, typename U> Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, 
{"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" @@ -32,14 +31,14 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T, typename U> -StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const U alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, @@ -47,7 +46,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or // to matrix A (argument: conjugate transpose) @@ -70,10 +69,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons // space. 
Also tests that the leading dimensions of: // matrix A cannot be less than N when rotated, or less than K when not-rotated // matrix C cannot be less than N - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + TestMatrixC(n, n, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); @@ -82,106 +79,92 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - a_rotated == false && a_conjugate == false; - auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - a_rotated == false && b_conjugate == false; - - // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto b_temp = (b_no_temp) ? 
a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); - - // Convert the arguments to complex versions - auto complex_alpha = T{alpha, static_cast<U>(0.0)}; - auto complex_beta = T{beta, static_cast<U>(0.0)}; - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector<Event>(); - auto emptyEventList = std::vector<Event>(); - - // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. Two copies are created. - if (!a_no_temp) { - auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, - true, a_rotated, a_conjugate); - eventWaitList.push_back(eventProcessA); - if (ErrorIn(status)) { return status; } - } - if (!b_no_temp) { - auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne<T>(), program, - true, a_rotated, b_conjugate); - eventWaitList.push_back(eventProcessB); - if (ErrorIn(status)) { return status; } - } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. 
- auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, - n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, - true, c_rotated, false); - eventWaitList.push_back(eventProcessC); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(n_ceiled)); - kernel.SetArgument(1, static_cast<int>(k_ceiled)); - kernel.SetArgument(2, GetRealArg(complex_alpha)); - kernel.SetArgument(3, GetRealArg(complex_beta)); - kernel.SetArgument(4, a_temp()); - kernel.SetArgument(5, b_temp()); - kernel.SetArgument(6, c_temp()); - - // Computes the global and local thread sizes - auto global = std::vector<size_t>{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel); - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, - false, c_rotated, false, upper, lower, true); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + a_rotated == false && a_conjugate == false; + auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + a_rotated == false && b_conjugate == false; + + // Creates the temporary matrices + auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); + + // Convert the arguments to complex versions + auto complex_alpha = T{alpha, static_cast<U>(0.0)}; + auto complex_beta = T{beta, static_cast<U>(0.0)}; + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. Two copies are created. 
+ if (!a_no_temp) { + auto eventProcessA = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a_temp, + ConstantOne<T>(), program, + true, a_rotated, a_conjugate); + eventWaitList.push_back(eventProcessA); + } + if (!b_no_temp) { + auto eventProcessB = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, b_temp, + ConstantOne<T>(), program, + true, a_rotated, b_conjugate); + eventWaitList.push_back(eventProcessB); + } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + auto eventProcessC = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + ConstantOne<T>(), program, + true, c_rotated, false); + eventWaitList.push_back(eventProcessC); + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n_ceiled)); + kernel.SetArgument(1, static_cast<int>(k_ceiled)); + kernel.SetArgument(2, GetRealArg(complex_alpha)); + kernel.SetArgument(3, GetRealArg(complex_beta)); + kernel.SetArgument(4, a_temp()); + kernel.SetArgument(5, b_temp()); + kernel.SetArgument(6, c_temp()); + + // Computes the global and local thread sizes + auto global = std::vector<size_t>{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel = Event(); + RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); + eventWaitList.push_back(eventKernel); + + // 
Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + n, n, c_ld, c_offset, c_buffer, + ConstantOne<T>(), program, + false, c_rotated, false, upper, lower, true); } // ================================================================================================= diff --git a/src/routines/level3/xherk.hpp b/src/routines/level3/xherk.hpp index 3f156a1b..51f29d7e 100644 --- a/src/routines/level3/xherk.hpp +++ b/src/routines/level3/xherk.hpp @@ -30,12 +30,12 @@ class Xherk: public Routine { Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK"); // Templated-precision implementation of the routine - StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const U alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const U beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + void DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const U alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const U beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= diff --git a/src/routines/level3/xsymm.cpp b/src/routines/level3/xsymm.cpp index 04e4b718..d7f771d1 100644 --- a/src/routines/level3/xsymm.cpp +++ b/src/routines/level3/xsymm.cpp @@ -29,7 +29,7 @@ Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle, +void Xsymm<T>::DoSymm(const Layout layout, const Side side, const 
Triangle triangle, const size_t m, const size_t n, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, @@ -38,15 +38,14 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } + if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the // left) or B (on the right) in the Xgemm routine. auto k = (side == Side::kLeft) ? m : n; // Checks for validity of the squared A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(k, k, a_buffer, a_offset, a_ld); // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix @@ -55,73 +54,68 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle auto kernel_name = (is_upper) ? 
"SymmUpperToSquared" : "SymmLowerToSquared"; // Temporary buffer for a copy of the symmetric matrix - try { - auto temp_symm = Buffer<T>(context_, k*k); - - // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm - // routine afterwards + auto temp_symm = Buffer<T>(context_, k*k); + + // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm + // routine afterwards + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the symmetric-to-squared kernel + kernel.SetArgument(0, static_cast<int>(k)); + kernel.SetArgument(1, static_cast<int>(a_ld)); + kernel.SetArgument(2, static_cast<int>(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast<int>(k)); + kernel.SetArgument(5, static_cast<int>(k)); + kernel.SetArgument(6, static_cast<int>(0)); + kernel.SetArgument(7, temp_symm()); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // symmetric-to-squared kernel uses the same parameters. + auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + auto kernelEvent = Event(); + RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); + + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + + // Runs the regular Xgemm code with either "C := AB+C" or ... + if (side == Side::kLeft) { + DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + temp_symm, 0, k, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld); + } + + // ... with "C := BA+C". Note that A and B are now reversed. 
+ else { try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the symmetric-to-squared kernel - kernel.SetArgument(0, static_cast<int>(k)); - kernel.SetArgument(1, static_cast<int>(a_ld)); - kernel.SetArgument(2, static_cast<int>(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast<int>(k)); - kernel.SetArgument(5, static_cast<int>(k)); - kernel.SetArgument(6, static_cast<int>(0)); - kernel.SetArgument(7, temp_symm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // symmetric-to-squared kernel uses the same parameters. - auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - - // Synchronize now: 'DoGemm' does not accept a list of events to wait for - kernelEvent.WaitForCompletion(); - - // Runs the regular Xgemm code with either "C := AB+C" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - temp_symm, 0, k, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld); - } - - // ... with "C := BA+C". Note that A and B are now reversed. 
- else { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_symm, 0, k, - beta, - c_buffer, c_offset, c_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } + DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_symm, 0, k, + beta, + c_buffer, c_offset, c_ld); + } catch (BLASError &e) { + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(e.status()) { + case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details()); + case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details()); + case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details()); + case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details()); + case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details()); + case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details()); + default: throw; } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } + } + } } // ================================================================================================= diff --git a/src/routines/level3/xsymm.hpp b/src/routines/level3/xsymm.hpp index 428f78ef..ee965364 100644 --- a/src/routines/level3/xsymm.hpp +++ b/src/routines/level3/xsymm.hpp @@ -39,13 +39,13 @@ class Xsymm: public Xgemm<T> { Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM"); // Templated-precision implementation of the routine - StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + void DoSymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index badf3100..cb0e0461 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -22,8 +22,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" 
#include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" @@ -32,14 +31,14 @@ Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, @@ -48,7 +47,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Computes whether or not the matrices are transposed in memory. This is based on their layout // (row or column-major) and whether or not they are requested to be pre-transposed. 
@@ -67,12 +66,9 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons // matrix A cannot be less than N when rotated, or less than K when not-rotated // matrix B cannot be less than N when rotated, or less than K when not-rotated // matrix C cannot be less than N - auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld); + TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld); + TestMatrixC(n, n, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); @@ -81,114 +77,99 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - ab_rotated == false; - auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && - ab_rotated == false; - - // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto b_temp = (b_no_temp) ? 
b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector<Event>(); - auto emptyEventList = std::vector<Event>(); - - // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. - if (!a_no_temp) { - auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, - ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, - true, ab_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessA); - } - if (!b_no_temp) { - auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, - ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne<T>(), program, - true, ab_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessB); - } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. 
- auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, - n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, - true, c_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessC); - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(n_ceiled)); - kernel.SetArgument(1, static_cast<int>(k_ceiled)); - kernel.SetArgument(2, GetRealArg(alpha)); - kernel.SetArgument(3, GetRealArg(beta)); - kernel.SetArgument(4, a_temp()); - kernel.SetArgument(5, b_temp()); - kernel.SetArgument(6, c_temp()); - - // Computes the global and local thread sizes - auto global = std::vector<size_t>{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel1 = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel1); - - // Swaps the arguments for matrices A and B, and sets 'beta' to 1 - auto one = static_cast<T>(1); - kernel.SetArgument(3, GetRealArg(one)); - kernel.SetArgument(4, b_temp()); - kernel.SetArgument(5, a_temp()); - - // Runs the kernel again - auto eventKernel2 = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel2); - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, - n_ceiled, n_ceiled, n_ceiled, 0, 
c_temp, - n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, - false, c_rotated, false, upper, lower, false); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + ab_rotated == false; + auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && + ab_rotated == false; + + // Creates the temporary matrices + auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. 
+ if (!a_no_temp) { + auto eventProcessA = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a_temp, + ConstantOne<T>(), program, + true, ab_rotated, false); + eventWaitList.push_back(eventProcessA); + } + if (!b_no_temp) { + auto eventProcessB = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, b_temp, + ConstantOne<T>(), program, + true, ab_rotated, false); + eventWaitList.push_back(eventProcessB); + } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + auto eventProcessC = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + ConstantOne<T>(), program, + true, c_rotated, false); + eventWaitList.push_back(eventProcessC); + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n_ceiled)); + kernel.SetArgument(1, static_cast<int>(k_ceiled)); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); + kernel.SetArgument(4, a_temp()); + kernel.SetArgument(5, b_temp()); + kernel.SetArgument(6, c_temp()); + + // Computes the global and local thread sizes + auto global = std::vector<size_t>{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel1 = Event(); + RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); + eventWaitList.push_back(eventKernel1); + + // Swaps the arguments 
for matrices A and B, and sets 'beta' to 1 + auto one = static_cast<T>(1); + kernel.SetArgument(3, GetRealArg(one)); + kernel.SetArgument(4, b_temp()); + kernel.SetArgument(5, a_temp()); + + // Runs the kernel again + auto eventKernel2 = Event(); + RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); + eventWaitList.push_back(eventKernel2); + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + n, n, c_ld, c_offset, c_buffer, + ConstantOne<T>(), program, + false, c_rotated, false, upper, lower, false); } // ================================================================================================= diff --git a/src/routines/level3/xsyr2k.hpp b/src/routines/level3/xsyr2k.hpp index 56185653..a02c6e16 100644 --- a/src/routines/level3/xsyr2k.hpp +++ b/src/routines/level3/xsyr2k.hpp @@ -30,13 +30,13 @@ class Xsyr2k: public Routine { Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K"); // Templated-precision implementation of the routine - StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + void DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); }; // 
================================================================================================= diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index 438aa218..bd6c4b25 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -22,8 +22,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" @@ -32,14 +31,14 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, @@ -47,7 +46,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Computes whether or not the matrices are transposed in memory. 
This is based on their layout // (row or column-major) and whether or not they are requested to be pre-transposed. @@ -65,10 +64,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const // space. Also tests that the leading dimensions of: // matrix A cannot be less than N when rotated, or less than K when not-rotated // matrix C cannot be less than N - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + TestMatrixC(n, n, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); @@ -77,90 +74,76 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - a_rotated == false; - - // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector<Event>(); - auto emptyEventList = std::vector<Event>(); - - // Runs the pre-processing kernel for matrix A. 
This transposes the matrix, but also pads zeros - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. - if (!a_no_temp) { - auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, - true, a_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessA); - } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. - auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, - n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, - true, c_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessC); - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(n_ceiled)); - kernel.SetArgument(1, static_cast<int>(k_ceiled)); - kernel.SetArgument(2, GetRealArg(alpha)); - kernel.SetArgument(3, GetRealArg(beta)); - kernel.SetArgument(4, a_temp()); - kernel.SetArgument(5, a_temp()); - kernel.SetArgument(6, c_temp()); - - // Computes the global and local thread sizes - auto global = std::vector<size_t>{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - 
eventWaitList.push_back(eventKernel); - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, - false, c_rotated, false, upper, lower, false); - if (ErrorIn(status)) { return status; } - - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + a_rotated == false; + + // Creates the temporary matrices + auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. 
+ if (!a_no_temp) { + auto eventProcessA = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a_temp, + ConstantOne<T>(), program, + true, a_rotated, false); + eventWaitList.push_back(eventProcessA); + } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + auto eventProcessC = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + ConstantOne<T>(), program, + true, c_rotated, false); + eventWaitList.push_back(eventProcessC); + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n_ceiled)); + kernel.SetArgument(1, static_cast<int>(k_ceiled)); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); + kernel.SetArgument(4, a_temp()); + kernel.SetArgument(5, a_temp()); + kernel.SetArgument(6, c_temp()); + + // Computes the global and local thread sizes + auto global = std::vector<size_t>{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel = Event(); + RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); + eventWaitList.push_back(eventKernel); + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + n, n, c_ld, c_offset, c_buffer, + ConstantOne<T>(), program, + false, c_rotated, false, upper, lower, false); } // 
================================================================================================= diff --git a/src/routines/level3/xsyrk.hpp b/src/routines/level3/xsyrk.hpp index 7c075c26..de42b824 100644 --- a/src/routines/level3/xsyrk.hpp +++ b/src/routines/level3/xsyrk.hpp @@ -32,12 +32,12 @@ class Xsyrk: public Routine { Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK"); // Templated-precision implementation of the routine - StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + void DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= diff --git a/src/routines/level3/xtrmm.cpp b/src/routines/level3/xtrmm.cpp index 74a82822..6bf77cfa 100644 --- a/src/routines/level3/xtrmm.cpp +++ b/src/routines/level3/xtrmm.cpp @@ -29,7 +29,7 @@ Xtrmm<T>::Xtrmm(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle, +void Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, @@ -37,15 +37,14 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) { // Makes sure all dimensions are larger than zero - if ((m == 0) || (n 
== 0)) { return StatusCode::kInvalidDimension; } + if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); } // Computes the k dimension. This is based on whether or not matrix is A (on the left) // or B (on the right) in the Xgemm routine. auto k = (side == Side::kLeft) ? m : n; // Checks for validity of the triangular A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(k, k, a_buffer, a_offset, a_ld); // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix @@ -57,74 +56,69 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false; // Temporary buffer for a copy of the triangular matrix - try { - auto temp_triangular = Buffer<T>(context_, k*k); - - // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm - // routine afterwards + auto temp_triangular = Buffer<T>(context_, k*k); + + // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm + // routine afterwards + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the triangular-to-squared kernel + kernel.SetArgument(0, static_cast<int>(k)); + kernel.SetArgument(1, static_cast<int>(a_ld)); + kernel.SetArgument(2, static_cast<int>(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast<int>(k)); + kernel.SetArgument(5, static_cast<int>(k)); + kernel.SetArgument(6, static_cast<int>(0)); + kernel.SetArgument(7, temp_triangular()); + kernel.SetArgument(8, static_cast<int>(unit_diagonal)); + + // Uses the common padding kernel's thread configuration. 
This is allowed, since the + // triangular-to-squared kernel uses the same parameters. + auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + auto kernelEvent = Event(); + RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); + + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + + // Runs the regular Xgemm code with either "B := alpha*A*B" or ... + if (side == Side::kLeft) { + DoGemm(layout, a_transpose, Transpose::kNo, + m, n, k, + alpha, + temp_triangular, 0, k, + b_buffer, b_offset, b_ld, + static_cast<T>(0.0), + b_buffer, b_offset, b_ld); + } + + // ... with "B := alpha*B*A". Note that A and B are now reversed. + else { try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the triangular-to-squared kernel - kernel.SetArgument(0, static_cast<int>(k)); - kernel.SetArgument(1, static_cast<int>(a_ld)); - kernel.SetArgument(2, static_cast<int>(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast<int>(k)); - kernel.SetArgument(5, static_cast<int>(k)); - kernel.SetArgument(6, static_cast<int>(0)); - kernel.SetArgument(7, temp_triangular()); - kernel.SetArgument(8, static_cast<int>(unit_diagonal)); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // triangular-to-squared kernel uses the same parameters. 
- auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - - // Synchronize now: 'DoGemm' does not accept a list of events to wait for - kernelEvent.WaitForCompletion(); - - // Runs the regular Xgemm code with either "B := alpha*A*B" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, a_transpose, Transpose::kNo, - m, n, k, - alpha, - temp_triangular, 0, k, - b_buffer, b_offset, b_ld, - static_cast<T>(0.0), - b_buffer, b_offset, b_ld); - } - - // ... with "B := alpha*B*A". Note that A and B are now reversed. - else { - status = DoGemm(layout, Transpose::kNo, a_transpose, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_triangular, 0, k, - static_cast<T>(0.0), - b_buffer, b_offset, b_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } + DoGemm(layout, Transpose::kNo, a_transpose, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_triangular, 0, k, + static_cast<T>(0.0), + b_buffer, b_offset, b_ld); + } catch (BLASError &e) { + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(e.status()) { + case 
StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details()); + case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details()); + case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details()); + case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details()); + case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details()); + case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details()); + default: throw; } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + } + } } // ================================================================================================= diff --git a/src/routines/level3/xtrmm.hpp b/src/routines/level3/xtrmm.hpp index 186a120e..967bf132 100644 --- a/src/routines/level3/xtrmm.hpp +++ b/src/routines/level3/xtrmm.hpp @@ -38,12 +38,12 @@ class Xtrmm: public Xgemm<T> { Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM"); // Templated-precision implementation of the routine - StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld); + void DoTrmm(const Layout layout, const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld); }; // 
================================================================================================= diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp index af9080af..875ca7d2 100644 --- a/src/routines/levelx/xomatcopy.cpp +++ b/src/routines/levelx/xomatcopy.cpp @@ -22,27 +22,26 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xomatcopy<T>::Xomatcopy(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" #include "../../kernels/level3/transpose_fast.opencl" #include "../../kernels/level3/transpose_pad.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) { +void Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) { // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } + if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); } // Determines whether to transpose the matrix A const auto transpose = (a_transpose != Transpose::kNo); @@ -63,22 +62,17 @@ StatusCode 
Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_trans // Also tests that the leading dimensions of: // matrix A cannot be less than N when rotated, or less than M when not-rotated // matrix B cannot be less than M when rotated, or less than N when not-rotated - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); // Loads the program from the database const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); auto emptyEventList = std::vector<Event>(); - status = PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - b_one, b_two, b_ld, b_offset, b_buffer, - alpha, program, false, transpose, conjugate); - if (ErrorIn(status)) { return status; } - - return StatusCode::kSuccess; + PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + b_one, b_two, b_ld, b_offset, b_buffer, + alpha, program, false, transpose, conjugate); } // ================================================================================================= diff --git a/src/routines/levelx/xomatcopy.hpp b/src/routines/levelx/xomatcopy.hpp index 0e580230..2da66693 100644 --- a/src/routines/levelx/xomatcopy.hpp +++ b/src/routines/levelx/xomatcopy.hpp @@ -28,10 +28,10 @@ class Xomatcopy: public Routine { Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY"); // Templated-precision implementation of the routine - StatusCode DoOmatcopy(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, 
const size_t b_ld); + void DoOmatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld); }; // ================================================================================================= diff --git a/src/utilities.hpp b/src/utilities.hpp index 038a8a96..a7fcbd25 100644 --- a/src/utilities.hpp +++ b/src/utilities.hpp @@ -24,6 +24,7 @@ #include "clblast.h" #include "clblast_half.h" #include "clpp11.hpp" +#include "clblast_exceptions.hpp" #include "msvc.hpp" @@ -207,11 +208,6 @@ bool CheckArgument(const int argc, char *argv[], std::string &help, const std::s // ================================================================================================= -// Helper function to check for errors in the status code -inline bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); } - -// ================================================================================================= - // Returns a random number to be used as a seed unsigned int GetRandomSeed(); |