diff options
Diffstat (limited to 'src/routines')
87 files changed, 1858 insertions, 2103 deletions
diff --git a/src/routines/common.cpp b/src/routines/common.cpp index 3969cf9f..c995dc12 100644 --- a/src/routines/common.cpp +++ b/src/routines/common.cpp @@ -20,22 +20,26 @@ namespace clblast { // ================================================================================================= // Enqueues a kernel, waits for completion, and checks for errors -StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, - std::vector<size_t> global, const std::vector<size_t> &local, - EventPointer event, const std::vector<Event> &waitForEvents) { +void RunKernel(Kernel &kernel, Queue &queue, const Device &device, + std::vector<size_t> global, const std::vector<size_t> &local, + EventPointer event, const std::vector<Event> &waitForEvents) { if (!local.empty()) { // Tests for validity of the local thread sizes if (local.size() > device.MaxWorkItemDimensions()) { - return StatusCode::kInvalidLocalNumDimensions; + throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions); } const auto max_work_item_sizes = device.MaxWorkItemSizes(); for (auto i=size_t{0}; i<local.size(); ++i) { - if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; } + if (local[i] > max_work_item_sizes[i]) { + throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim); + } } auto local_size = size_t{1}; for (auto &item: local) { local_size *= item; } - if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; } + if (local_size > device.MaxWorkGroupSize()) { + throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal); + } // Make sure the global thread sizes are at least equal to the local sizes for (auto i=size_t{0}; i<global.size(); ++i) { @@ -45,7 +49,9 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, // Tests for local memory usage const auto local_mem_usage = kernel.LocalMemUsage(device); - if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; } + if (!device.IsLocalMemoryValid(local_mem_usage)) { + throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage); + } // Prints the name of the kernel to launch in case of debugging in verbose mode #ifdef VERBOSE @@ -55,9 +61,7 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, #endif // Launches the kernel (and checks for launch errors) - try { - kernel.Launch(queue, global, local, event, waitForEvents); - } catch (...) { return StatusCode::kKernelLaunchError; } + kernel.Launch(queue, global, local, event, waitForEvents); // Prints the elapsed execution time in case of debugging in verbose mode #ifdef VERBOSE @@ -66,9 +70,6 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count(); printf("[DEBUG] Completed kernel in %.2lf ms\n", timing); #endif - - // No errors, normal termination of this function - return StatusCode::kSuccess; } // ================================================================================================= diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 9d8849c3..802abec4 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -27,29 +27,29 @@ namespace clblast { // ================================================================================================= // Enqueues a kernel, waits for completion, and checks for errors -StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, - std::vector<size_t> global, const std::vector<size_t> &local, - EventPointer event, const std::vector<Event> &waitForEvents = {}); +void RunKernel(Kernel &kernel, Queue &queue, const Device &device, + std::vector<size_t> global, const std::vector<size_t> &local, + EventPointer event, const std::vector<Event> &waitForEvents = {}); // ================================================================================================= // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able // to write to symmetric and triangular matrices through optional arguments. template <typename T> -StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, - const Database &db, - EventPointer event, const std::vector<Event> &waitForEvents, - const size_t src_one, const size_t src_two, - const size_t src_ld, const size_t src_offset, - const Buffer<T> &src, - const size_t dest_one, const size_t dest_two, - const size_t dest_ld, const size_t dest_offset, - const Buffer<T> &dest, - const T alpha, - const Program &program, const bool do_pad, - const bool do_transpose, const bool do_conjugate, - const bool upper = false, const bool lower = false, - const bool diagonal_imag_zero = false) { +void PadCopyTransposeMatrix(Queue &queue, const Device &device, + const Database &db, + EventPointer event, const std::vector<Event> &waitForEvents, + const size_t src_one, const size_t src_two, + const size_t src_ld, const size_t src_offset, + const Buffer<T> &src, + const size_t dest_one, const size_t dest_two, + const size_t dest_ld, const size_t dest_offset, + const Buffer<T> &dest, + const T alpha, + const Program &program, const bool do_pad, + const bool do_transpose, const bool do_conjugate, + const bool upper = false, const bool lower = false, + const bool diagonal_imag_zero = false) { // Determines whether or not the fast-version could potentially be used auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && @@ -84,77 +84,75 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, } // Retrieves the kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program, kernel_name); - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(src_ld)); - kernel.SetArgument(1, src()); - kernel.SetArgument(2, dest()); - kernel.SetArgument(3, GetRealArg(alpha)); + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(src_ld)); + kernel.SetArgument(1, src()); + kernel.SetArgument(2, dest()); + kernel.SetArgument(3, GetRealArg(alpha)); + } + else { + kernel.SetArgument(0, static_cast<int>(src_one)); + kernel.SetArgument(1, static_cast<int>(src_two)); + kernel.SetArgument(2, static_cast<int>(src_ld)); + kernel.SetArgument(3, static_cast<int>(src_offset)); + kernel.SetArgument(4, src()); + kernel.SetArgument(5, static_cast<int>(dest_one)); + kernel.SetArgument(6, static_cast<int>(dest_two)); + kernel.SetArgument(7, static_cast<int>(dest_ld)); + kernel.SetArgument(8, static_cast<int>(dest_offset)); + kernel.SetArgument(9, dest()); + kernel.SetArgument(10, GetRealArg(alpha)); + if (do_pad) { + kernel.SetArgument(11, static_cast<int>(do_conjugate)); } else { - kernel.SetArgument(0, static_cast<int>(src_one)); - kernel.SetArgument(1, static_cast<int>(src_two)); - kernel.SetArgument(2, static_cast<int>(src_ld)); - kernel.SetArgument(3, static_cast<int>(src_offset)); - kernel.SetArgument(4, src()); - kernel.SetArgument(5, static_cast<int>(dest_one)); - kernel.SetArgument(6, static_cast<int>(dest_two)); - kernel.SetArgument(7, static_cast<int>(dest_ld)); - kernel.SetArgument(8, static_cast<int>(dest_offset)); - kernel.SetArgument(9, dest()); - kernel.SetArgument(10, GetRealArg(alpha)); - if (do_pad) { - kernel.SetArgument(11, static_cast<int>(do_conjugate)); - } - else { - kernel.SetArgument(11, static_cast<int>(upper)); - kernel.SetArgument(12, static_cast<int>(lower)); - kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero)); - } + kernel.SetArgument(11, static_cast<int>(upper)); + kernel.SetArgument(12, static_cast<int>(lower)); + kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero)); } + } - // Launches the kernel and returns the error code. Uses global and local thread sizes based on - // parameters in the database. - if (do_transpose) { - if (use_fast_kernel) { - const auto global = std::vector<size_t>{ - dest_one / db["TRA_WPT"], - dest_two / db["TRA_WPT"] - }; - const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]}; - return RunKernel(kernel, queue, device, global, local, event, waitForEvents); - } - else { - const auto global = std::vector<size_t>{ - Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), - Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]) - }; - const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]}; - return RunKernel(kernel, queue, device, global, local, event, waitForEvents); - } + // Launches the kernel and returns the error code. Uses global and local thread sizes based on + // parameters in the database. + if (do_transpose) { + if (use_fast_kernel) { + const auto global = std::vector<size_t>{ + dest_one / db["TRA_WPT"], + dest_two / db["TRA_WPT"] + }; + const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { - if (use_fast_kernel) { - const auto global = std::vector<size_t>{ - dest_one / db["COPY_VW"], - dest_two / db["COPY_WPT"] - }; - const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]}; - return RunKernel(kernel, queue, device, global, local, event, waitForEvents); - } - else { - const auto global = std::vector<size_t>{ - Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), - Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]) - }; - const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]}; - return RunKernel(kernel, queue, device, global, local, event, waitForEvents); - } + const auto global = std::vector<size_t>{ + Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), + Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]) + }; + const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); } - } catch (...) { return StatusCode::kInvalidKernel; } + } + else { + if (use_fast_kernel) { + const auto global = std::vector<size_t>{ + dest_one / db["COPY_VW"], + dest_two / db["COPY_WPT"] + }; + const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + else { + const auto global = std::vector<size_t>{ + Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), + Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]) + }; + const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + } } // ================================================================================================= diff --git a/src/routines/level1/xamax.cpp b/src/routines/level1/xamax.cpp index 6b6e7f9e..e9efa1a7 100644 --- a/src/routines/level1/xamax.cpp +++ b/src/routines/level1/xamax.cpp @@ -22,74 +22,64 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/xamax.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xamax<T>::DoAmax(const size_t n, - const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xamax<T>::DoAmax(const size_t n, + const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorIndex(1, imax_buffer, imax_offset); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorIndex(1, imax_buffer, imax_offset); // Retrieves the Xamax kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xamax"); - auto kernel2 = Kernel(program, "XamaxEpilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer1 = Buffer<T>(context_, temp_size); - auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast<int>(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast<int>(x_offset)); - kernel1.SetArgument(3, static_cast<int>(x_inc)); - kernel1.SetArgument(4, temp_buffer1()); - kernel1.SetArgument(5, temp_buffer2()); - - // Event waiting list - auto eventWaitList = std::vector<Event>(); - - // Launches the main kernel - auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; - auto local1 = std::vector<size_t>{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer1()); - kernel2.SetArgument(1, temp_buffer2()); - kernel2.SetArgument(2, imax_buffer()); - kernel2.SetArgument(3, static_cast<int>(imax_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector<size_t>{db_["WGS2"]}; - auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel1 = Kernel(program, "Xamax"); + auto kernel2 = Kernel(program, "XamaxEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer1 = Buffer<T>(context_, temp_size); + auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast<int>(x_offset)); + kernel1.SetArgument(3, static_cast<int>(x_inc)); + kernel1.SetArgument(4, temp_buffer1()); + kernel1.SetArgument(5, temp_buffer2()); + + // Event waiting list + auto eventWaitList = std::vector<Event>(); + + // Launches the main kernel + auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; + auto local1 = std::vector<size_t>{db_["WGS1"]}; + auto kernelEvent = Event(); + RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer1()); + kernel2.SetArgument(1, temp_buffer2()); + kernel2.SetArgument(2, imax_buffer()); + kernel2.SetArgument(3, static_cast<int>(imax_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector<size_t>{db_["WGS2"]}; + auto local2 = std::vector<size_t>{db_["WGS2"]}; + RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); } // ================================================================================================= diff --git a/src/routines/level1/xamax.hpp b/src/routines/level1/xamax.hpp index aa45a8e4..4d1e0082 100644 --- a/src/routines/level1/xamax.hpp +++ b/src/routines/level1/xamax.hpp @@ -28,9 +28,9 @@ class Xamax: public Routine { Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX"); // Templated-precision implementation of the routine - StatusCode DoAmax(const size_t n, - const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoAmax(const size_t n, + const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xasum.cpp b/src/routines/level1/xasum.cpp index 0c1ce903..a242a5fa 100644 --- a/src/routines/level1/xasum.cpp +++ b/src/routines/level1/xasum.cpp @@ -22,71 +22,61 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/xasum.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xasum<T>::DoAsum(const size_t n, - const Buffer<T> &asum_buffer, const size_t asum_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xasum<T>::DoAsum(const size_t n, + const Buffer<T> &asum_buffer, const size_t asum_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorScalar(1, asum_buffer, asum_offset); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorScalar(1, asum_buffer, asum_offset); // Retrieves the Xasum kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xasum"); - auto kernel2 = Kernel(program, "XasumEpilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer = Buffer<T>(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast<int>(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast<int>(x_offset)); - kernel1.SetArgument(3, static_cast<int>(x_inc)); - kernel1.SetArgument(4, temp_buffer()); - - // Event waiting list - auto eventWaitList = std::vector<Event>(); - - // Launches the main kernel - auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; - auto local1 = std::vector<size_t>{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer()); - kernel2.SetArgument(1, asum_buffer()); - kernel2.SetArgument(2, static_cast<int>(asum_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector<size_t>{db_["WGS2"]}; - auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel1 = Kernel(program, "Xasum"); + auto kernel2 = Kernel(program, "XasumEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer = Buffer<T>(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast<int>(x_offset)); + kernel1.SetArgument(3, static_cast<int>(x_inc)); + kernel1.SetArgument(4, temp_buffer()); + + // Event waiting list + auto eventWaitList = std::vector<Event>(); + + // Launches the main kernel + auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; + auto local1 = std::vector<size_t>{db_["WGS1"]}; + auto kernelEvent = Event(); + RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, asum_buffer()); + kernel2.SetArgument(2, static_cast<int>(asum_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector<size_t>{db_["WGS2"]}; + auto local2 = std::vector<size_t>{db_["WGS2"]}; + RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); } // ================================================================================================= diff --git a/src/routines/level1/xasum.hpp b/src/routines/level1/xasum.hpp index 5a253f4d..0afcc4ff 100644 --- a/src/routines/level1/xasum.hpp +++ b/src/routines/level1/xasum.hpp @@ -28,9 +28,9 @@ class Xasum: public Routine { Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM"); // Templated-precision implementation of the routine - StatusCode DoAsum(const size_t n, - const Buffer<T> &asum_buffer, const size_t asum_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoAsum(const size_t n, + const Buffer<T> &asum_buffer, const size_t asum_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp index 3445e2b5..5436c5b7 100644 --- a/src/routines/level1/xaxpy.cpp +++ b/src/routines/level1/xaxpy.cpp @@ -22,29 +22,26 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/level1.opencl" #include "../../kernels/level1/xaxpy.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xaxpy<T>::DoAxpy(const size_t n, const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version can be used bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && @@ -55,45 +52,39 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha, auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy"; // Retrieves the Xaxpy kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, GetRealArg(alpha)); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, GetRealArg(alpha)); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast<int>(x_offset)); - kernel.SetArgument(4, static_cast<int>(x_inc)); - kernel.SetArgument(5, y_buffer()); - kernel.SetArgument(6, static_cast<int>(y_offset)); - kernel.SetArgument(7, static_cast<int>(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, GetRealArg(alpha)); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, GetRealArg(alpha)); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast<int>(x_offset)); + kernel.SetArgument(4, static_cast<int>(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, static_cast<int>(y_offset)); + kernel.SetArgument(7, static_cast<int>(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } } // ================================================================================================= diff --git a/src/routines/level1/xaxpy.hpp b/src/routines/level1/xaxpy.hpp index caac871e..9b30dfaa 100644 --- a/src/routines/level1/xaxpy.hpp +++ b/src/routines/level1/xaxpy.hpp @@ -28,9 +28,9 @@ class Xaxpy: public Routine { Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY"); // Templated-precision implementation of the routine - StatusCode DoAxpy(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoAxpy(const size_t n, const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xcopy.cpp b/src/routines/level1/xcopy.cpp index 673ef349..d86200c0 100644 --- a/src/routines/level1/xcopy.cpp +++ b/src/routines/level1/xcopy.cpp @@ -22,29 +22,26 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/level1.opencl" #include "../../kernels/level1/xcopy.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xcopy<T>::DoCopy(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xcopy<T>::DoCopy(const size_t n, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version can be used bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && @@ -55,43 +52,37 @@ StatusCode Xcopy<T>::DoCopy(const size_t n, auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy"; // Retrieves the Xcopy kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, static_cast<int>(x_offset)); - kernel.SetArgument(3, static_cast<int>(x_inc)); - kernel.SetArgument(4, y_buffer()); - kernel.SetArgument(5, static_cast<int>(y_offset)); - kernel.SetArgument(6, static_cast<int>(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, static_cast<int>(x_offset)); + kernel.SetArgument(3, static_cast<int>(x_inc)); + kernel.SetArgument(4, y_buffer()); + kernel.SetArgument(5, static_cast<int>(y_offset)); + kernel.SetArgument(6, static_cast<int>(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } } // ================================================================================================= diff --git a/src/routines/level1/xcopy.hpp b/src/routines/level1/xcopy.hpp index 0c424ba3..a6454fcc 100644 --- a/src/routines/level1/xcopy.hpp +++ b/src/routines/level1/xcopy.hpp @@ -28,9 +28,9 @@ class Xcopy: public Routine { Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY"); // Templated-precision implementation of the routine - StatusCode DoCopy(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoCopy(const size_t n, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xdot.cpp b/src/routines/level1/xdot.cpp index bafea157..9d718913 100644 --- a/src/routines/level1/xdot.cpp +++ b/src/routines/level1/xdot.cpp @@ -22,79 +22,68 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/xdot.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xdot<T>::DoDot(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const bool do_conjugate) { +void Xdot<T>::DoDot(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const bool do_conjugate) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorScalar(1, dot_buffer, dot_offset); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); + TestVectorScalar(1, dot_buffer, dot_offset); // Retrieves the Xdot kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xdot"); - auto kernel2 = Kernel(program, "XdotEpilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer = Buffer<T>(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast<int>(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast<int>(x_offset)); - kernel1.SetArgument(3, static_cast<int>(x_inc)); - kernel1.SetArgument(4, y_buffer()); - kernel1.SetArgument(5, static_cast<int>(y_offset)); - kernel1.SetArgument(6, static_cast<int>(y_inc)); - kernel1.SetArgument(7, temp_buffer()); - kernel1.SetArgument(8, static_cast<int>(do_conjugate)); - - // Event waiting list - auto eventWaitList = std::vector<Event>(); - - // Launches the main kernel - auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; - auto local1 = std::vector<size_t>{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer()); - kernel2.SetArgument(1, dot_buffer()); - kernel2.SetArgument(2, static_cast<int>(dot_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector<size_t>{db_["WGS2"]}; - auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel1 = Kernel(program, "Xdot"); + auto kernel2 = Kernel(program, "XdotEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer = Buffer<T>(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast<int>(x_offset)); + kernel1.SetArgument(3, static_cast<int>(x_inc)); + kernel1.SetArgument(4, y_buffer()); + kernel1.SetArgument(5, static_cast<int>(y_offset)); + kernel1.SetArgument(6, static_cast<int>(y_inc)); + kernel1.SetArgument(7, temp_buffer()); + kernel1.SetArgument(8, static_cast<int>(do_conjugate)); + + // Event waiting list + auto eventWaitList = std::vector<Event>(); + + // Launches the main kernel + auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; + auto local1 = std::vector<size_t>{db_["WGS1"]}; + auto kernelEvent = Event(); + RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, dot_buffer()); + kernel2.SetArgument(2, static_cast<int>(dot_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector<size_t>{db_["WGS2"]}; + auto local2 = std::vector<size_t>{db_["WGS2"]}; + RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); } // ================================================================================================= diff --git a/src/routines/level1/xdot.hpp b/src/routines/level1/xdot.hpp index 02c1efaa..a4c9dfa0 100644 --- a/src/routines/level1/xdot.hpp +++ b/src/routines/level1/xdot.hpp @@ -28,11 +28,11 @@ class Xdot: public Routine { Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT"); // Templated-precision implementation of the routine - StatusCode DoDot(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const bool do_conjugate = false); + void DoDot(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const bool do_conjugate = false); }; // ================================================================================================= diff --git a/src/routines/level1/xdotc.cpp b/src/routines/level1/xdotc.cpp index 27cf2bab..5a4e939a 100644 --- a/src/routines/level1/xdotc.cpp +++ b/src/routines/level1/xdotc.cpp @@ -29,14 +29,14 @@ Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xdotc<T>::DoDotc(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { - return DoDot(n, dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - true); +void Xdotc<T>::DoDotc(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + DoDot(n, dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + true); } // ================================================================================================= diff --git a/src/routines/level1/xdotc.hpp b/src/routines/level1/xdotc.hpp index b8cbdaf5..ab7465f5 100644 --- a/src/routines/level1/xdotc.hpp +++ b/src/routines/level1/xdotc.hpp @@ -31,10 +31,10 @@ class Xdotc: public Xdot<T> { Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC"); // Templated-precision implementation of the routine - StatusCode DoDotc(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoDotc(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xdotu.cpp b/src/routines/level1/xdotu.cpp index 0bce70b7..b9d8bcef 100644 --- a/src/routines/level1/xdotu.cpp +++ b/src/routines/level1/xdotu.cpp @@ -28,14 +28,14 @@ Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xdotu<T>::DoDotu(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { - return DoDot(n, dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - false); +void Xdotu<T>::DoDotu(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + DoDot(n, dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + false); } // ================================================================================================= diff --git a/src/routines/level1/xdotu.hpp b/src/routines/level1/xdotu.hpp index b3f73086..cad91c58 100644 --- a/src/routines/level1/xdotu.hpp +++ b/src/routines/level1/xdotu.hpp @@ -31,10 +31,10 @@ class Xdotu: public Xdot<T> { Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU"); // Templated-precision implementation of the routine - StatusCode DoDotu(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoDotu(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xmax.hpp b/src/routines/level1/xmax.hpp index 5a0236f2..2b7a5ae7 100644 --- a/src/routines/level1/xmax.hpp +++ b/src/routines/level1/xmax.hpp @@ -35,10 +35,10 @@ class Xmax: public Xamax<T> { // Forwards to the regular absolute version. The implementation difference is realised in the // kernel through a pre-processor macro based on the name of the routine. - StatusCode DoMax(const size_t n, - const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { - return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc); + void DoMax(const size_t n, + const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc); } }; diff --git a/src/routines/level1/xmin.hpp b/src/routines/level1/xmin.hpp index 6befec64..47a195ea 100644 --- a/src/routines/level1/xmin.hpp +++ b/src/routines/level1/xmin.hpp @@ -35,10 +35,10 @@ class Xmin: public Xamax<T> { // Forwards to the regular max-absolute version. The implementation difference is realised in the // kernel through a pre-processor macro based on the name of the routine. - StatusCode DoMin(const size_t n, - const Buffer<unsigned int> &imin_buffer, const size_t imin_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { - return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc); + void DoMin(const size_t n, + const Buffer<unsigned int> &imin_buffer, const size_t imin_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc); } }; diff --git a/src/routines/level1/xnrm2.cpp b/src/routines/level1/xnrm2.cpp index 97615d8b..373820a4 100644 --- a/src/routines/level1/xnrm2.cpp +++ b/src/routines/level1/xnrm2.cpp @@ -22,71 +22,61 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/xnrm2.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xnrm2<T>::DoNrm2(const size_t n, - const Buffer<T> &nrm2_buffer, const size_t nrm2_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xnrm2<T>::DoNrm2(const size_t n, + const Buffer<T> &nrm2_buffer, const size_t nrm2_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorScalar(1, nrm2_buffer, nrm2_offset); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorScalar(1, nrm2_buffer, nrm2_offset); // Retrieves the Xnrm2 kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xnrm2"); - auto kernel2 = Kernel(program, "Xnrm2Epilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer = Buffer<T>(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast<int>(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast<int>(x_offset)); - kernel1.SetArgument(3, static_cast<int>(x_inc)); - kernel1.SetArgument(4, temp_buffer()); - - // Event waiting list - auto eventWaitList = std::vector<Event>(); - - // Launches the main kernel - auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; - auto local1 = std::vector<size_t>{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer()); - kernel2.SetArgument(1, nrm2_buffer()); - kernel2.SetArgument(2, static_cast<int>(nrm2_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector<size_t>{db_["WGS2"]}; - auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel1 = Kernel(program, "Xnrm2"); + auto kernel2 = Kernel(program, "Xnrm2Epilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer = Buffer<T>(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast<int>(x_offset)); + kernel1.SetArgument(3, static_cast<int>(x_inc)); + kernel1.SetArgument(4, temp_buffer()); + + // Event waiting list + auto eventWaitList = std::vector<Event>(); + + // Launches the main kernel + auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; + auto local1 = std::vector<size_t>{db_["WGS1"]}; + auto kernelEvent = Event(); + RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, nrm2_buffer()); + kernel2.SetArgument(2, static_cast<int>(nrm2_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector<size_t>{db_["WGS2"]}; + auto local2 = std::vector<size_t>{db_["WGS2"]}; + RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); } // ================================================================================================= diff --git a/src/routines/level1/xnrm2.hpp b/src/routines/level1/xnrm2.hpp index 7baf07f5..3183ce24 100644 --- a/src/routines/level1/xnrm2.hpp +++ b/src/routines/level1/xnrm2.hpp @@ -28,9 +28,9 @@ class Xnrm2: public Routine { Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2"); // Templated-precision implementation of the routine - StatusCode DoNrm2(const size_t n, - const Buffer<T> &nrm2_buffer, const size_t nrm2_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoNrm2(const size_t n, + const Buffer<T> &nrm2_buffer, const size_t nrm2_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp index bcc43c3b..17410f01 100644 --- a/src/routines/level1/xscal.cpp +++ b/src/routines/level1/xscal.cpp @@ -22,26 +22,24 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/level1.opencl" #include "../../kernels/level1/xscal.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xscal<T>::DoScal(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xscal<T>::DoScal(const size_t n, const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vector for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); // Determines whether or not the fast-version can be used bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && @@ -51,41 +49,35 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha, auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal"; // Retrieves the Xscal kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, alpha); - kernel.SetArgument(2, x_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, alpha); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast<int>(x_offset)); - kernel.SetArgument(4, static_cast<int>(x_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + } + else { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast<int>(x_offset)); + kernel.SetArgument(4, static_cast<int>(x_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } } // ================================================================================================= diff --git a/src/routines/level1/xscal.hpp b/src/routines/level1/xscal.hpp index 6c585cb2..02c847cc 100644 --- a/src/routines/level1/xscal.hpp +++ b/src/routines/level1/xscal.hpp @@ -28,8 +28,8 @@ class Xscal: public Routine { Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL"); // Templated-precision implementation of the routine - StatusCode DoScal(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoScal(const size_t n, const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xsum.hpp b/src/routines/level1/xsum.hpp index 84e20bea..a69d6511 100644 --- a/src/routines/level1/xsum.hpp +++ b/src/routines/level1/xsum.hpp @@ -35,10 +35,10 @@ class Xsum: public Xasum<T> { // Forwards to the regular absolute version. The implementation difference is realised in the // kernel through a pre-processor macro based on the name of the routine. - StatusCode DoSum(const size_t n, - const Buffer<T> &sum_buffer, const size_t sum_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { - return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc); + void DoSum(const size_t n, + const Buffer<T> &sum_buffer, const size_t sum_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc); } }; diff --git a/src/routines/level1/xswap.cpp b/src/routines/level1/xswap.cpp index 03907cbd..c9b97dc9 100644 --- a/src/routines/level1/xswap.cpp +++ b/src/routines/level1/xswap.cpp @@ -22,29 +22,26 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, { #include "../../kernels/level1/level1.opencl" #include "../../kernels/level1/xswap.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xswap<T>::DoSwap(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xswap<T>::DoSwap(const size_t n, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version can be used bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && @@ -55,43 +52,37 @@ StatusCode Xswap<T>::DoSwap(const size_t n, auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap"; // Retrieves the Xswap kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, static_cast<int>(x_offset)); - kernel.SetArgument(3, static_cast<int>(x_inc)); - kernel.SetArgument(4, y_buffer()); - kernel.SetArgument(5, static_cast<int>(y_offset)); - kernel.SetArgument(6, static_cast<int>(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, static_cast<int>(x_offset)); + kernel.SetArgument(3, static_cast<int>(x_inc)); + kernel.SetArgument(4, y_buffer()); + kernel.SetArgument(5, static_cast<int>(y_offset)); + kernel.SetArgument(6, static_cast<int>(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } } // ================================================================================================= diff --git a/src/routines/level1/xswap.hpp b/src/routines/level1/xswap.hpp index 4f9ea36d..eadf58e5 100644 --- a/src/routines/level1/xswap.hpp +++ b/src/routines/level1/xswap.hpp @@ -28,9 +28,9 @@ class Xswap: public Routine { Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP"); // Templated-precision implementation of the routine - StatusCode DoSwap(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoSwap(const size_t n, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xgbmv.cpp b/src/routines/level2/xgbmv.cpp index ea4f001c..e80b9a96 100644 --- a/src/routines/level2/xgbmv.cpp +++ b/src/routines/level2/xgbmv.cpp @@ -29,13 +29,13 @@ Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Reverses the upper and lower band count auto rotated = (layout == Layout::kRowMajor); @@ -46,13 +46,13 @@ StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose, // The specific hermitian matrix-accesses are implemented in the kernel guarded by the // ROUTINE_GBMV define. bool fast_kernels = false; - return MatVec(layout, a_transpose, - m, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - 0, false, kl_real, ku_real); + MatVec(layout, a_transpose, + m, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + 0, false, kl_real, ku_real); } // ================================================================================================= diff --git a/src/routines/level2/xgbmv.hpp b/src/routines/level2/xgbmv.hpp index 686ab642..e5f670ec 100644 --- a/src/routines/level2/xgbmv.hpp +++ b/src/routines/level2/xgbmv.hpp @@ -33,13 +33,13 @@ class Xgbmv: public Xgemv<T> { Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV"); // Templated-precision implementation of the routine - StatusCode DoGbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoGbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 4e32ba41..7b4c2e8f 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -22,52 +22,51 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, { #include "../../kernels/level2/xgemv.opencl" #include "../../kernels/level2/xgemv_fast.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Performs the matrix-vector multiplication - return MatVec(layout, a_transpose, - m, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - true, true, - 0, false, 0, 0); // N/A for this routine + MatVec(layout, a_transpose, + m, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + true, true, + 0, false, 0, 0); // N/A for this routine } // ================================================================================================= // The generic implementation, also suited for other (non general) matrix-vector multiplications template <typename T> -StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - bool fast_kernel, bool fast_kernel_rot, - const size_t parameter, const bool packed, - const size_t kl, const size_t ku) { +void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + bool fast_kernel, bool fast_kernel_rot, + const size_t parameter, const bool packed, + const size_t kl, const size_t ku) { // Makes sure all dimensions are larger than zero - if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } + if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Computes whether or not the matrix has an alternative layout (row or column-major). auto a_altlayout = (layout == Layout::kRowMajor); @@ -91,14 +90,10 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, auto a_conjugate = (a_transpose == Transpose::kConjugate); // Tests the matrix and the vectors for validity - auto status = StatusCode::kSuccess; - if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); } - else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); } - if (ErrorIn(status)) { return status; } - status = TestVectorX(n_real, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(m_real, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + if (packed) { TestMatrixAP(n, a_buffer, a_offset); } + else { TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); } + TestVectorX(n_real, x_buffer, x_offset, x_inc); + TestVectorY(m_real, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version can be used fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && @@ -127,39 +122,33 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, } // Retrieves the Xgemv kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(m_real)); - kernel.SetArgument(1, static_cast<int>(n_real)); - kernel.SetArgument(2, GetRealArg(alpha)); - kernel.SetArgument(3, GetRealArg(beta)); - kernel.SetArgument(4, static_cast<int>(a_rotated)); - kernel.SetArgument(5, a_buffer()); - kernel.SetArgument(6, static_cast<int>(a_offset)); - kernel.SetArgument(7, static_cast<int>(a_ld)); - kernel.SetArgument(8, x_buffer()); - kernel.SetArgument(9, static_cast<int>(x_offset)); - kernel.SetArgument(10, static_cast<int>(x_inc)); - kernel.SetArgument(11, y_buffer()); - kernel.SetArgument(12, static_cast<int>(y_offset)); - kernel.SetArgument(13, static_cast<int>(y_inc)); - kernel.SetArgument(14, static_cast<int>(a_conjugate)); - kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm - kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices - kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices - - // Launches the kernel - auto global = std::vector<size_t>{global_size}; - auto local = std::vector<size_t>{local_size}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(m_real)); + kernel.SetArgument(1, static_cast<int>(n_real)); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); + kernel.SetArgument(4, static_cast<int>(a_rotated)); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, static_cast<int>(a_offset)); + kernel.SetArgument(7, static_cast<int>(a_ld)); + kernel.SetArgument(8, x_buffer()); + kernel.SetArgument(9, static_cast<int>(x_offset)); + kernel.SetArgument(10, static_cast<int>(x_inc)); + kernel.SetArgument(11, y_buffer()); + kernel.SetArgument(12, static_cast<int>(y_offset)); + kernel.SetArgument(13, static_cast<int>(y_inc)); + kernel.SetArgument(14, static_cast<int>(a_conjugate)); + kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm + kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices + kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices + + // Launches the kernel + auto global = std::vector<size_t>{global_size}; + auto local = std::vector<size_t>{local_size}; + RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= diff --git a/src/routines/level2/xgemv.hpp b/src/routines/level2/xgemv.hpp index e9afec8d..1e1fa726 100644 --- a/src/routines/level2/xgemv.hpp +++ b/src/routines/level2/xgemv.hpp @@ -28,25 +28,25 @@ class Xgemv: public Routine { Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV"); // Templated-precision implementation of the routine - StatusCode DoGemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoGemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); // Generic version used also for other matrix-vector multiplications - StatusCode MatVec(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - bool fast_kernel, bool fast_kernel_rot, - const size_t parameter, const bool packed, - const size_t kl, const size_t ku); + void MatVec(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + bool fast_kernel, bool fast_kernel_rot, + const size_t parameter, const bool packed, + const size_t kl, const size_t ku); }; // ================================================================================================= diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp index 29cffe0c..d16ebd11 100644 --- a/src/routines/level2/xger.cpp +++ b/src/routines/level2/xger.cpp @@ -22,26 +22,25 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, { #include "../../kernels/level2/level2.opencl" #include "../../kernels/level2/xger.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xger<T>::DoGer(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { +void Xger<T>::DoGer(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { // Makes sure all dimensions are larger than zero - if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } + if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Computes whether or not the matrix has an alternative layout (row or column-major). const auto a_is_rowmajor = (layout == Layout::kRowMajor); @@ -49,44 +48,35 @@ StatusCode Xger<T>::DoGer(const Layout layout, const auto a_two = (a_is_rowmajor) ? m : n; // Tests the matrix and the vectors for validity - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestVectorX(m, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + TestVectorX(m, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); // Retrieves the kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xger"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(a_one)); - kernel.SetArgument(1, static_cast<int>(a_two)); - kernel.SetArgument(2, GetRealArg(alpha)); - kernel.SetArgument(3, x_buffer()); - kernel.SetArgument(4, static_cast<int>(x_offset)); - kernel.SetArgument(5, static_cast<int>(x_inc)); - kernel.SetArgument(6, y_buffer()); - kernel.SetArgument(7, static_cast<int>(y_offset)); - kernel.SetArgument(8, static_cast<int>(y_inc)); - kernel.SetArgument(9, a_buffer()); - kernel.SetArgument(10, static_cast<int>(a_offset)); - kernel.SetArgument(11, static_cast<int>(a_ld)); - kernel.SetArgument(12, static_cast<int>(a_is_rowmajor)); - - // Launches the kernel - auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]); - auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]); - auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled}; - auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, "Xger"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(a_one)); + kernel.SetArgument(1, static_cast<int>(a_two)); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, x_buffer()); + kernel.SetArgument(4, static_cast<int>(x_offset)); + kernel.SetArgument(5, static_cast<int>(x_inc)); + kernel.SetArgument(6, y_buffer()); + kernel.SetArgument(7, static_cast<int>(y_offset)); + kernel.SetArgument(8, static_cast<int>(y_inc)); + kernel.SetArgument(9, a_buffer()); + kernel.SetArgument(10, static_cast<int>(a_offset)); + kernel.SetArgument(11, static_cast<int>(a_ld)); + kernel.SetArgument(12, static_cast<int>(a_is_rowmajor)); + + // Launches the kernel + auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]); + auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]); + auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled}; + auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; + RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= diff --git a/src/routines/level2/xger.hpp b/src/routines/level2/xger.hpp index 3c6abe44..fbbb07a1 100644 --- a/src/routines/level2/xger.hpp +++ b/src/routines/level2/xger.hpp @@ -28,12 +28,12 @@ class Xger: public Routine { Xger(Queue &queue, EventPointer event, const std::string &name = "GER"); // Templated-precision implementation of the routine - StatusCode DoGer(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); + void DoGer(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); }; // ================================================================================================= diff --git a/src/routines/level2/xgerc.cpp b/src/routines/level2/xgerc.cpp index d9feda97..4fa2e2a8 100644 --- a/src/routines/level2/xgerc.cpp +++ b/src/routines/level2/xgerc.cpp @@ -28,19 +28,19 @@ Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xgerc<T>::DoGerc(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { +void Xgerc<T>::DoGerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { // Regular Ger operation on complex data, plus conjugation in the kernel guarded by the // ROUTINE_GERC guard. - return DoGer(layout, m, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld); + DoGer(layout, m, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld); } // ================================================================================================= diff --git a/src/routines/level2/xgerc.hpp b/src/routines/level2/xgerc.hpp index f1d04dfd..2d61f2b7 100644 --- a/src/routines/level2/xgerc.hpp +++ b/src/routines/level2/xgerc.hpp @@ -31,12 +31,12 @@ class Xgerc: public Xger<T> { Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC"); // Templated-precision implementation of the routine - StatusCode DoGerc(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); + void DoGerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); }; // ================================================================================================= diff --git a/src/routines/level2/xgeru.cpp b/src/routines/level2/xgeru.cpp index da9e91c2..c77e69c5 100644 --- a/src/routines/level2/xgeru.cpp +++ b/src/routines/level2/xgeru.cpp @@ -28,18 +28,18 @@ Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xgeru<T>::DoGeru(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { +void Xgeru<T>::DoGeru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { // Regular Ger operation on complex data - return DoGer(layout, m, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld); + DoGer(layout, m, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld); } // ================================================================================================= diff --git a/src/routines/level2/xgeru.hpp b/src/routines/level2/xgeru.hpp index fb50e917..4cae6b58 100644 --- a/src/routines/level2/xgeru.hpp +++ b/src/routines/level2/xgeru.hpp @@ -31,12 +31,12 @@ class Xgeru: public Xger<T> { Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU"); // Templated-precision implementation of the routine - StatusCode DoGeru(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); + void DoGeru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); }; // ================================================================================================= diff --git a/src/routines/level2/xhbmv.cpp b/src/routines/level2/xhbmv.cpp index f6c0e3c4..c7c9ed9d 100644 --- a/src/routines/level2/xhbmv.cpp +++ b/src/routines/level2/xhbmv.cpp @@ -29,13 +29,13 @@ Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -45,13 +45,13 @@ StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle, // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the // ROUTINE_HBMV define. bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, false, k, 0); + MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, k, 0); } // ================================================================================================= diff --git a/src/routines/level2/xhbmv.hpp b/src/routines/level2/xhbmv.hpp index d668eb88..76d3c91e 100644 --- a/src/routines/level2/xhbmv.hpp +++ b/src/routines/level2/xhbmv.hpp @@ -33,13 +33,13 @@ class Xhbmv: public Xgemv<T> { Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV"); // Templated-precision implementation of the routine - StatusCode DoHbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoHbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xhemv.cpp b/src/routines/level2/xhemv.cpp index 2cbcf7b4..209ff654 100644 --- a/src/routines/level2/xhemv.cpp +++ b/src/routines/level2/xhemv.cpp @@ -29,13 +29,13 @@ Xhemv<T>::Xhemv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -45,13 +45,13 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle, // The specific hermitian matrix-accesses are implemented in the kernel guarded by the // ROUTINE_HEMV define. bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, false, 0, 0); + MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xhemv.hpp b/src/routines/level2/xhemv.hpp index 8e062fd3..20d2df22 100644 --- a/src/routines/level2/xhemv.hpp +++ b/src/routines/level2/xhemv.hpp @@ -33,13 +33,13 @@ class Xhemv: public Xgemv<T> { Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV"); // Templated-precision implementation of the routine - StatusCode DoHemv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoHemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp index 6dd95938..6c334e63 100644 --- a/src/routines/level2/xher.cpp +++ b/src/routines/level2/xher.cpp @@ -21,11 +21,10 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T, typename U> Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, { #include "../../kernels/level2/level2.opencl" #include "../../kernels/level2/xher.opencl" - ; + }) { } // ================================================================================================= @@ -41,15 +40,15 @@ template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; } // The main routine template <typename T, typename U> -StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, - const size_t n, - const U alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const bool packed) { +void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, + const size_t n, + const U alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const bool packed) { // Makes sure the dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // The data is either in the upper or lower triangle const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -57,47 +56,38 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, const auto is_rowmajor = (layout == Layout::kRowMajor); // Tests the matrix and the vectors for validity - auto status = StatusCode::kSuccess; - if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); } - else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); } - if (ErrorIn(status)) { return status; } - status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } + if (packed) { TestMatrixAP(n, a_buffer, a_offset); } + else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); } + TestVectorX(n, x_buffer, x_offset, x_inc); // If alpha is zero an update is not required - if (alpha == U{0}) { return StatusCode::kSuccess; } + if (alpha == U{0}) { return; } // Creates a matching version of alpha const auto matching_alpha = GetAlpha(alpha); // Retrieves the kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xher"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, GetRealArg(matching_alpha)); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast<int>(x_offset)); - kernel.SetArgument(4, static_cast<int>(x_inc)); - kernel.SetArgument(5, a_buffer()); - kernel.SetArgument(6, static_cast<int>(a_offset)); - kernel.SetArgument(7, static_cast<int>(a_ld)); - kernel.SetArgument(8, static_cast<int>(is_upper)); - kernel.SetArgument(9, static_cast<int>(is_rowmajor)); - - // Launches the kernel - auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]); - auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); - auto global = std::vector<size_t>{global_one, global_two}; - auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, "Xher"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, GetRealArg(matching_alpha)); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast<int>(x_offset)); + kernel.SetArgument(4, static_cast<int>(x_inc)); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, static_cast<int>(a_offset)); + kernel.SetArgument(7, static_cast<int>(a_ld)); + kernel.SetArgument(8, static_cast<int>(is_upper)); + kernel.SetArgument(9, static_cast<int>(is_rowmajor)); + + // Launches the kernel + auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]); + auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); + auto global = std::vector<size_t>{global_one, global_two}; + auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; + RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= diff --git a/src/routines/level2/xher.hpp b/src/routines/level2/xher.hpp index 9ff6bf3f..70a30bda 100644 --- a/src/routines/level2/xher.hpp +++ b/src/routines/level2/xher.hpp @@ -31,12 +31,12 @@ class Xher: public Routine { T GetAlpha(const U alpha); // Templated-precision implementation of the routine - StatusCode DoHer(const Layout layout, const Triangle triangle, - const size_t n, - const U alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const bool packed = false); + void DoHer(const Layout layout, const Triangle triangle, + const size_t n, + const U alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const bool packed = false); }; // ================================================================================================= diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp index 3d57a9b9..11e2c871 100644 --- a/src/routines/level2/xher2.cpp +++ b/src/routines/level2/xher2.cpp @@ -21,27 +21,26 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, { #include "../../kernels/level2/level2.opencl" #include "../../kernels/level2/xher2.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const bool packed) { +void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const bool packed) { // Makes sure the dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // The data is either in the upper or lower triangle const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -49,46 +48,36 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, const auto is_rowmajor = (layout == Layout::kRowMajor); // Tests the matrix and the vectors for validity - auto status = StatusCode::kSuccess; - if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); } - else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); } - if (ErrorIn(status)) { return status; } - status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + if (packed) { TestMatrixAP(n, a_buffer, a_offset); } + else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); // Retrieves the kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xher2"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, GetRealArg(alpha)); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast<int>(x_offset)); - kernel.SetArgument(4, static_cast<int>(x_inc)); - kernel.SetArgument(5, y_buffer()); - kernel.SetArgument(6, static_cast<int>(y_offset)); - kernel.SetArgument(7, static_cast<int>(y_inc)); - kernel.SetArgument(8, a_buffer()); - kernel.SetArgument(9, static_cast<int>(a_offset)); - kernel.SetArgument(10, static_cast<int>(a_ld)); - kernel.SetArgument(11, static_cast<int>(is_upper)); - kernel.SetArgument(12, static_cast<int>(is_rowmajor)); - - // Launches the kernel - auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]); - auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); - auto global = std::vector<size_t>{global_one, global_two}; - auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, "Xher2"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, GetRealArg(alpha)); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast<int>(x_offset)); + kernel.SetArgument(4, static_cast<int>(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, static_cast<int>(y_offset)); + kernel.SetArgument(7, static_cast<int>(y_inc)); + kernel.SetArgument(8, a_buffer()); + kernel.SetArgument(9, static_cast<int>(a_offset)); + kernel.SetArgument(10, static_cast<int>(a_ld)); + kernel.SetArgument(11, static_cast<int>(is_upper)); + kernel.SetArgument(12, static_cast<int>(is_rowmajor)); + + // Launches the kernel + auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]); + auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); + auto global = std::vector<size_t>{global_one, global_two}; + auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; + RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= diff --git a/src/routines/level2/xher2.hpp b/src/routines/level2/xher2.hpp index 8c53c047..dcb2ecb7 100644 --- a/src/routines/level2/xher2.hpp +++ b/src/routines/level2/xher2.hpp @@ -28,13 +28,13 @@ class Xher2: public Routine { Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2"); // Templated-precision implementation of the routine - StatusCode DoHer2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const bool packed = false); + void DoHer2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const bool packed = false); }; // ================================================================================================= diff --git a/src/routines/level2/xhpmv.cpp b/src/routines/level2/xhpmv.cpp index e6f82b34..70a0ab0d 100644 --- a/src/routines/level2/xhpmv.cpp +++ b/src/routines/level2/xhpmv.cpp @@ -29,13 +29,13 @@ Xhpmv<T>::Xhpmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -45,13 +45,13 @@ StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle, // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the // ROUTINE_HPMV define. bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - ap_buffer, ap_offset, n, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, true, 0, 0); + MatVec(layout, Transpose::kNo, + n, n, alpha, + ap_buffer, ap_offset, n, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, true, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xhpmv.hpp b/src/routines/level2/xhpmv.hpp index b11192f9..13a6277c 100644 --- a/src/routines/level2/xhpmv.hpp +++ b/src/routines/level2/xhpmv.hpp @@ -33,13 +33,13 @@ class Xhpmv: public Xgemv<T> { Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV"); // Templated-precision implementation of the routine - StatusCode DoHpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoHpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xhpr.cpp b/src/routines/level2/xhpr.cpp index 225ebfe5..7e517c59 100644 --- a/src/routines/level2/xhpr.cpp +++ b/src/routines/level2/xhpr.cpp @@ -28,17 +28,17 @@ Xhpr<T,U>::Xhpr(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T, typename U> -StatusCode Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle, - const size_t n, - const U alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset) { +void Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle, + const size_t n, + const U alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset) { // Specific Xhpr functionality is implemented in the kernel using defines - return DoHer(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, n, - true); // packed matrix + DoHer(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, n, + true); // packed matrix } // ================================================================================================= diff --git a/src/routines/level2/xhpr.hpp b/src/routines/level2/xhpr.hpp index 37801c68..6ebc220e 100644 --- a/src/routines/level2/xhpr.hpp +++ b/src/routines/level2/xhpr.hpp @@ -31,11 +31,11 @@ class Xhpr: public Xher<T,U> { Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR"); // Templated-precision implementation of the routine - StatusCode DoHpr(const Layout layout, const Triangle triangle, - const size_t n, - const U alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset); + void DoHpr(const Layout layout, const Triangle triangle, + const size_t n, + const U alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset); }; // ================================================================================================= diff --git a/src/routines/level2/xhpr2.cpp b/src/routines/level2/xhpr2.cpp index 85f9d3f9..35daa365 100644 --- a/src/routines/level2/xhpr2.cpp +++ b/src/routines/level2/xhpr2.cpp @@ -28,19 +28,19 @@ Xhpr2<T>::Xhpr2(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset) { +void Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset) { // Specific Xhpr2 functionality is implemented in the kernel using defines - return DoHer2(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, n, - true); // packed matrix + DoHer2(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, n, + true); // packed matrix } // ================================================================================================= diff --git a/src/routines/level2/xhpr2.hpp b/src/routines/level2/xhpr2.hpp index d66dce55..f344fd48 100644 --- a/src/routines/level2/xhpr2.hpp +++ b/src/routines/level2/xhpr2.hpp @@ -31,12 +31,12 @@ class Xhpr2: public Xher2<T> { Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2"); // Templated-precision implementation of the routine - StatusCode DoHpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset); + void DoHpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset); }; // ================================================================================================= diff --git a/src/routines/level2/xsbmv.cpp b/src/routines/level2/xsbmv.cpp index 28730899..e47430d1 100644 --- a/src/routines/level2/xsbmv.cpp +++ b/src/routines/level2/xsbmv.cpp @@ -29,13 +29,13 @@ Xsbmv<T>::Xsbmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -45,13 +45,13 @@ StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle, // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the // ROUTINE_SBMV define. bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, false, k, 0); + MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, k, 0); } // ================================================================================================= diff --git a/src/routines/level2/xsbmv.hpp b/src/routines/level2/xsbmv.hpp index 16c5e9a8..a4542f49 100644 --- a/src/routines/level2/xsbmv.hpp +++ b/src/routines/level2/xsbmv.hpp @@ -33,13 +33,13 @@ class Xsbmv: public Xgemv<T> { Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV"); // Templated-precision implementation of the routine - StatusCode DoSbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoSbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xspmv.cpp b/src/routines/level2/xspmv.cpp index f6651012..bf1a49e1 100644 --- a/src/routines/level2/xspmv.cpp +++ b/src/routines/level2/xspmv.cpp @@ -29,13 +29,13 @@ Xspmv<T>::Xspmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -45,13 +45,13 @@ StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle, // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the // ROUTINE_SPMV define. bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - ap_buffer, ap_offset, n, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, true, 0, 0); + MatVec(layout, Transpose::kNo, + n, n, alpha, + ap_buffer, ap_offset, n, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, true, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xspmv.hpp b/src/routines/level2/xspmv.hpp index a0c69b85..94caa4ac 100644 --- a/src/routines/level2/xspmv.hpp +++ b/src/routines/level2/xspmv.hpp @@ -33,13 +33,13 @@ class Xspmv: public Xgemv<T> { Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV"); // Templated-precision implementation of the routine - StatusCode DoSpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoSpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xspr.cpp b/src/routines/level2/xspr.cpp index a75fe9c3..56791a7b 100644 --- a/src/routines/level2/xspr.cpp +++ b/src/routines/level2/xspr.cpp @@ -28,17 +28,17 @@ Xspr<T>::Xspr(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset) { +void Xspr<T>::DoSpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset) { // Specific Xspr functionality is implemented in the kernel using defines - return DoHer(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, n, - true); // packed matrix + DoHer(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, n, + true); // packed matrix } // ================================================================================================= diff --git a/src/routines/level2/xspr.hpp b/src/routines/level2/xspr.hpp index 6468c736..760a2ddb 100644 --- a/src/routines/level2/xspr.hpp +++ b/src/routines/level2/xspr.hpp @@ -31,11 +31,11 @@ class Xspr: public Xher<T,T> { Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR"); // Templated-precision implementation of the routine - StatusCode DoSpr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset); + void DoSpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset); }; // ================================================================================================= diff --git a/src/routines/level2/xspr2.cpp b/src/routines/level2/xspr2.cpp index c39a2eb4..8d0432c2 100644 --- a/src/routines/level2/xspr2.cpp +++ b/src/routines/level2/xspr2.cpp @@ -28,19 +28,19 @@ Xspr2<T>::Xspr2(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset) { +void Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset) { // Specific Xspr2 functionality is implemented in the kernel using defines - return DoHer2(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, n, - true); // packed matrix + DoHer2(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, n, + true); // packed matrix } // ================================================================================================= diff --git a/src/routines/level2/xspr2.hpp b/src/routines/level2/xspr2.hpp index 693c56a1..9f03f768 100644 --- a/src/routines/level2/xspr2.hpp +++ b/src/routines/level2/xspr2.hpp @@ -31,12 +31,12 @@ class Xspr2: public Xher2<T> { Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2"); // Templated-precision implementation of the routine - StatusCode DoSpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset); + void DoSpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &ap_buffer, const size_t ap_offset); }; // ================================================================================================= diff --git a/src/routines/level2/xsymv.cpp b/src/routines/level2/xsymv.cpp index 648d2a3e..86bb66b8 100644 --- a/src/routines/level2/xsymv.cpp +++ b/src/routines/level2/xsymv.cpp @@ -29,13 +29,13 @@ Xsymv<T>::Xsymv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -45,13 +45,13 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle, // The specific symmetric matrix-accesses are implemented in the kernel guarded by the // ROUTINE_SYMV define. bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, false, 0, 0); + MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xsymv.hpp b/src/routines/level2/xsymv.hpp index 67815f2f..3945802f 100644 --- a/src/routines/level2/xsymv.hpp +++ b/src/routines/level2/xsymv.hpp @@ -33,13 +33,13 @@ class Xsymv: public Xgemv<T> { Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV"); // Templated-precision implementation of the routine - StatusCode DoSymv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoSymv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xsyr.cpp b/src/routines/level2/xsyr.cpp index 758d8f8f..64c2dc74 100644 --- a/src/routines/level2/xsyr.cpp +++ b/src/routines/level2/xsyr.cpp @@ -28,16 +28,16 @@ Xsyr<T>::Xsyr(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { +void Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { // Specific Xsyr functionality is implemented in the kernel using defines - return DoHer(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - a_buffer, a_offset, a_ld); + DoHer(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld); } // ================================================================================================= diff --git a/src/routines/level2/xsyr.hpp b/src/routines/level2/xsyr.hpp index 20393454..a23ff80f 100644 --- a/src/routines/level2/xsyr.hpp +++ b/src/routines/level2/xsyr.hpp @@ -31,11 +31,11 @@ class Xsyr: public Xher<T,T> { Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR"); // Templated-precision implementation of the routine - StatusCode DoSyr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); + void DoSyr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); }; // ================================================================================================= diff --git a/src/routines/level2/xsyr2.cpp b/src/routines/level2/xsyr2.cpp index 6f43b219..38ca9d69 100644 --- a/src/routines/level2/xsyr2.cpp +++ b/src/routines/level2/xsyr2.cpp @@ -28,18 +28,18 @@ Xsyr2<T>::Xsyr2(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { +void Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) { // Specific Xsyr2 functionality is implemented in the kernel using defines - return DoHer2(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld); + DoHer2(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld); } // ================================================================================================= diff --git a/src/routines/level2/xsyr2.hpp b/src/routines/level2/xsyr2.hpp index 1a8dcbe8..5a8d8eb4 100644 --- a/src/routines/level2/xsyr2.hpp +++ b/src/routines/level2/xsyr2.hpp @@ -31,12 +31,12 @@ class Xsyr2: public Xher2<T> { Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2"); // Templated-precision implementation of the routine - StatusCode DoSyr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); + void DoSyr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); }; // ================================================================================================= diff --git a/src/routines/level2/xtbmv.cpp b/src/routines/level2/xtbmv.cpp index e315c544..f4a58ed2 100644 --- a/src/routines/level2/xtbmv.cpp +++ b/src/routines/level2/xtbmv.cpp @@ -29,17 +29,15 @@ Xtbmv<T>::Xtbmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Creates a copy of X: a temporary scratch buffer auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset); - try { - x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); - } catch (...) { } // Continues: error-code is returned in MatVec + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -52,20 +50,22 @@ StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle, // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the // ROUTINE_TBMV define. auto fast_kernels = false; - auto status = MatVec(layout, a_transpose, - n, n, static_cast<T>(1), - a_buffer, a_offset, a_ld, - scratch_buffer, x_offset, x_inc, static_cast<T>(0), - x_buffer, x_offset, x_inc, - fast_kernels, fast_kernels, - parameter, false, k, 0); - - // Returns the proper error code (renames vector Y to X) - switch(status) { - case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; - case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; - case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; - default: return status; + try { + MatVec(layout, a_transpose, + n, n, static_cast<T>(1), + a_buffer, a_offset, a_ld, + scratch_buffer, x_offset, x_inc, static_cast<T>(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, false, k, 0); + } catch (BLASError &e) { + // Returns the proper error code (renames vector Y to X) + switch (e.status()) { + case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details()); + case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details()); + case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details()); + default: throw; + } } } diff --git a/src/routines/level2/xtbmv.hpp b/src/routines/level2/xtbmv.hpp index 389e9705..abd12db6 100644 --- a/src/routines/level2/xtbmv.hpp +++ b/src/routines/level2/xtbmv.hpp @@ -35,11 +35,11 @@ class Xtbmv: public Xgemv<T> { Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV"); // Templated-precision implementation of the routine - StatusCode DoTbmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoTbmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xtpmv.cpp b/src/routines/level2/xtpmv.cpp index 46811089..c0d26699 100644 --- a/src/routines/level2/xtpmv.cpp +++ b/src/routines/level2/xtpmv.cpp @@ -29,17 +29,15 @@ Xtpmv<T>::Xtpmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Creates a copy of X: a temporary scratch buffer auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset); - try { - x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); - } catch (...) { } // Continues: error-code is returned in MatVec + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -52,20 +50,22 @@ StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle, // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the // ROUTINE_TPMV define. auto fast_kernels = false; - auto status = MatVec(layout, a_transpose, - n, n, static_cast<T>(1), - ap_buffer, ap_offset, n, - scratch_buffer, x_offset, x_inc, static_cast<T>(0), - x_buffer, x_offset, x_inc, - fast_kernels, fast_kernels, - parameter, true, 0, 0); - - // Returns the proper error code (renames vector Y to X) - switch(status) { - case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; - case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; - case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; - default: return status; + try { + MatVec(layout, a_transpose, + n, n, static_cast<T>(1), + ap_buffer, ap_offset, n, + scratch_buffer, x_offset, x_inc, static_cast<T>(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, true, 0, 0); + } catch (BLASError &e) { + // Returns the proper error code (renames vector Y to X) + switch (e.status()) { + case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details()); + case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details()); + case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details()); + default: throw; + } } } diff --git a/src/routines/level2/xtpmv.hpp b/src/routines/level2/xtpmv.hpp index 0e8cf1d2..5b3954e8 100644 --- a/src/routines/level2/xtpmv.hpp +++ b/src/routines/level2/xtpmv.hpp @@ -35,11 +35,11 @@ class Xtpmv: public Xgemv<T> { Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV"); // Templated-precision implementation of the routine - StatusCode DoTpmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoTpmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level2/xtrmv.cpp b/src/routines/level2/xtrmv.cpp index d2f24252..5fff9b31 100644 --- a/src/routines/level2/xtrmv.cpp +++ b/src/routines/level2/xtrmv.cpp @@ -29,17 +29,15 @@ Xtrmv<T>::Xtrmv(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Creates a copy of X: a temporary scratch buffer auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset); - try { - x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); - } catch (...) { } // Continues: error-code is returned in MatVec + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); // The data is either in the upper or lower triangle size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || @@ -52,20 +50,22 @@ StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle, // The specific triangular matrix-accesses are implemented in the kernel guarded by the // ROUTINE_TRMV define. auto fast_kernels = false; - auto status = MatVec(layout, a_transpose, - n, n, static_cast<T>(1), - a_buffer, a_offset, a_ld, - scratch_buffer, x_offset, x_inc, static_cast<T>(0), - x_buffer, x_offset, x_inc, - fast_kernels, fast_kernels, - parameter, false, 0, 0); - - // Returns the proper error code (renames vector Y to X) - switch(status) { - case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; - case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; - case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; - default: return status; + try { + MatVec(layout, a_transpose, + n, n, static_cast<T>(1), + a_buffer, a_offset, a_ld, + scratch_buffer, x_offset, x_inc, static_cast<T>(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, false, 0, 0); + } catch (BLASError &e) { + // Returns the proper error code (renames vector Y to X) + switch (e.status()) { + case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details()); + case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details()); + case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details()); + default: throw; + } } } diff --git a/src/routines/level2/xtrmv.hpp b/src/routines/level2/xtrmv.hpp index 07dd7841..b028ee68 100644 --- a/src/routines/level2/xtrmv.hpp +++ b/src/routines/level2/xtrmv.hpp @@ -35,11 +35,11 @@ class Xtrmv: public Xgemv<T> { Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV"); // Templated-precision implementation of the routine - StatusCode DoTrmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoTrmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 1602c69f..4f70dc7a 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -24,8 +24,7 @@ template <typename T> Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"}, - PrecisionValue<T>()) { - source_string_ = + PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" @@ -37,30 +36,28 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_direct_part1.opencl" #include "../../kernels/level3/xgemm_direct_part2.opencl" #include "../../kernels/level3/xgemm_direct_part3.opencl" - ; - auto source_string_part_2 = // separated in two parts to prevent C1091 in MSVC 2013 + , // separated in two parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" - ; - source_string_ += source_string_part_2; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xgemm<T>::DoGemm(const Layout layout, - const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { +void Xgemm<T>::DoGemm(const Layout layout, + const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; } + if ((m == 0) || (n == 0) || (k == 0)) { throw BLASError(StatusCode::kInvalidDimension); } // Computes whether or not the matrices are transposed in memory. This is based on their layout // (row or column-major) and whether or not they are requested to be pre-transposed. Note @@ -99,12 +96,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, // matrix A cannot be less than K when rotated, or less than M when not-rotated // matrix B cannot be less than N when rotated, or less than K when not-rotated // matrix C cannot be less than N when rotated, or less than M when not-rotated - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); + TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld); // Selects which version of GEMM to run const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]); @@ -131,7 +125,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, // requirements, but several pre and post-processing kernels take care of those. However, the // overhead of these extra kernels might not be ideal for certain devices/arguments. template <typename T> -StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, +void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, @@ -142,8 +136,6 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k const size_t a_one, const size_t a_two, const bool a_want_rotated, const size_t b_one, const size_t b_two, const bool b_want_rotated, const size_t c_one, const size_t c_two, const bool c_want_rotated) { - auto status = StatusCode::kSuccess; - // Calculates the ceiled versions of m, n, and k const auto m_ceiled = Ceil(m, db_["MWG"]); const auto n_ceiled = Ceil(n, db_["NWG"]); @@ -158,109 +150,95 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled; const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled; - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 && - a_do_transpose == false && a_conjugate == false; - auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 && - b_do_transpose == false && b_conjugate == false; - auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 && - c_do_transpose == false; - - // Creates the temporary matrices - const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i); - const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i); - const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i); - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector<Event>(); - auto emptyEventList = std::vector<Event>(); - - // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. - if (!a_no_temp) { - auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - a_one_i, a_two_i, a_one_i, 0, a_temp, - ConstantOne<T>(), program, - true, a_do_transpose, a_conjugate); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessA); - } - - // As above, but now for matrix B - if (!b_no_temp) { - auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, - b_one, b_two, b_ld, b_offset, b_buffer, - b_one_i, b_two_i, b_one_i, 0, b_temp, - ConstantOne<T>(), program, - true, b_do_transpose, b_conjugate); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessB); - } - - // As above, but now for matrix C. This is only necessary if C is used both as input and output. - if (!c_no_temp && beta != static_cast<T>(0)) { - auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, - c_one, c_two, c_ld, c_offset, c_buffer, - c_one_i, c_two_i, c_one_i, 0, c_temp, - ConstantOne<T>(), program, - true, c_do_transpose, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessC); - } - - // Retrieves the Xgemm kernel from the compiled binary - try { - auto kernel = Kernel(program, "Xgemm"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(m_ceiled)); - kernel.SetArgument(1, static_cast<int>(n_ceiled)); - kernel.SetArgument(2, static_cast<int>(k_ceiled)); - kernel.SetArgument(3, GetRealArg(alpha)); - kernel.SetArgument(4, GetRealArg(beta)); - kernel.SetArgument(5, a_temp()); - kernel.SetArgument(6, b_temp()); - kernel.SetArgument(7, c_temp()); - - // Computes the global and local thread sizes - const auto global = std::vector<size_t>{ - (c_one_i * db_["MDIMC"]) / db_["MWG"], - (c_two_i * db_["NDIMC"]) / db_["NWG"] - }; - const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel = Event(); - auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_; - status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Runs the post-processing kernel if needed - if (!c_no_temp) { - eventWaitList.push_back(eventKernel); - status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, - c_one_i, c_two_i, c_one_i, 0, c_temp, - c_one, c_two, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, - false, c_do_transpose, false); - if (ErrorIn(status)) { return status; } - } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 && + a_do_transpose == false && a_conjugate == false; + auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 && + b_do_transpose == false && b_conjugate == false; + auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 && + c_do_transpose == false; + + // Creates the temporary matrices + const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i); + const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i); + const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i); + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. + if (!a_no_temp) { + auto eventProcessA = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + a_one_i, a_two_i, a_one_i, 0, a_temp, + ConstantOne<T>(), program, + true, a_do_transpose, a_conjugate); + eventWaitList.push_back(eventProcessA); + } + + // As above, but now for matrix B + if (!b_no_temp) { + auto eventProcessB = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, + b_one, b_two, b_ld, b_offset, b_buffer, + b_one_i, b_two_i, b_one_i, 0, b_temp, + ConstantOne<T>(), program, + true, b_do_transpose, b_conjugate); + eventWaitList.push_back(eventProcessB); + } + + // As above, but now for matrix C. This is only necessary if C is used both as input and output. + if (!c_no_temp && beta != static_cast<T>(0)) { + auto eventProcessC = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, + c_one, c_two, c_ld, c_offset, c_buffer, + c_one_i, c_two_i, c_one_i, 0, c_temp, + ConstantOne<T>(), program, + true, c_do_transpose, false); + eventWaitList.push_back(eventProcessC); + } + + // Retrieves the Xgemm kernel from the compiled binary + auto kernel = Kernel(program, "Xgemm"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(m_ceiled)); + kernel.SetArgument(1, static_cast<int>(n_ceiled)); + kernel.SetArgument(2, static_cast<int>(k_ceiled)); + kernel.SetArgument(3, GetRealArg(alpha)); + kernel.SetArgument(4, GetRealArg(beta)); + kernel.SetArgument(5, a_temp()); + kernel.SetArgument(6, b_temp()); + kernel.SetArgument(7, c_temp()); + + // Computes the global and local thread sizes + const auto global = std::vector<size_t>{ + (c_one_i * db_["MDIMC"]) / db_["MWG"], + (c_two_i * db_["NDIMC"]) / db_["NWG"] + }; + const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel = Event(); + auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_; + RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList); + + // Runs the post-processing kernel if needed + if (!c_no_temp) { + eventWaitList.push_back(eventKernel); + PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, + c_one_i, c_two_i, c_one_i, 0, c_temp, + c_one, c_two, c_ld, c_offset, c_buffer, + ConstantOne<T>(), program, + false, c_do_transpose, false); + } } @@ -268,7 +246,7 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k // The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels. template <typename T> -StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k, +void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, @@ -281,46 +259,40 @@ StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k, const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); // Retrieves the proper XgemmDirect kernel from the compiled binary - try { - const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") : - (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN"); - auto kernel = Kernel(program, name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(m)); - kernel.SetArgument(1, static_cast<int>(n)); - kernel.SetArgument(2, static_cast<int>(k)); - kernel.SetArgument(3, GetRealArg(alpha)); - kernel.SetArgument(4, GetRealArg(beta)); - kernel.SetArgument(5, a_buffer()); - kernel.SetArgument(6, static_cast<int>(a_offset)); - kernel.SetArgument(7, static_cast<int>(a_ld)); - kernel.SetArgument(8, b_buffer()); - kernel.SetArgument(9, static_cast<int>(b_offset)); - kernel.SetArgument(10, static_cast<int>(b_ld)); - kernel.SetArgument(11, c_buffer()); - kernel.SetArgument(12, static_cast<int>(c_offset)); - kernel.SetArgument(13, static_cast<int>(c_ld)); - kernel.SetArgument(14, static_cast<int>(c_do_transpose)); - kernel.SetArgument(15, static_cast<int>(a_conjugate)); - kernel.SetArgument(16, static_cast<int>(b_conjugate)); - - // Computes the global and local thread sizes - const auto m_ceiled = Ceil(m, db_["WGD"]); - const auto n_ceiled = Ceil(n, db_["WGD"]); - const auto global = std::vector<size_t>{ - (m_ceiled * db_["MDIMCD"]) / db_["WGD"], - (n_ceiled * db_["NDIMCD"]) / db_["WGD"] - }; - const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]}; - - // Launches the kernel - auto status = RunKernel(kernel, queue_, device_, global, local, event_); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") : + (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN"); + auto kernel = Kernel(program, name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(m)); + kernel.SetArgument(1, static_cast<int>(n)); + kernel.SetArgument(2, static_cast<int>(k)); + kernel.SetArgument(3, GetRealArg(alpha)); + kernel.SetArgument(4, GetRealArg(beta)); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, static_cast<int>(a_offset)); + kernel.SetArgument(7, static_cast<int>(a_ld)); + kernel.SetArgument(8, b_buffer()); + kernel.SetArgument(9, static_cast<int>(b_offset)); + kernel.SetArgument(10, static_cast<int>(b_ld)); + kernel.SetArgument(11, c_buffer()); + kernel.SetArgument(12, static_cast<int>(c_offset)); + kernel.SetArgument(13, static_cast<int>(c_ld)); + kernel.SetArgument(14, static_cast<int>(c_do_transpose)); + kernel.SetArgument(15, static_cast<int>(a_conjugate)); + kernel.SetArgument(16, static_cast<int>(b_conjugate)); + + // Computes the global and local thread sizes + const auto m_ceiled = Ceil(m, db_["WGD"]); + const auto n_ceiled = Ceil(n, db_["WGD"]); + const auto global = std::vector<size_t>{ + (m_ceiled * db_["MDIMCD"]) / db_["WGD"], + (n_ceiled * db_["NDIMCD"]) / db_["WGD"] + }; + const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]}; + + // Launches the kernel + RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp index 46e12453..c61611b6 100644 --- a/src/routines/level3/xgemm.hpp +++ b/src/routines/level3/xgemm.hpp @@ -28,36 +28,36 @@ class Xgemm: public Routine { Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM"); // Templated-precision implementation of the routine - StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, + void DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + + // Indirect version of GEMM (with pre and post-processing kernels) + void GemmIndirect(const size_t m, const size_t n, const size_t k, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); - - // Indirect version of GEMM (with pre and post-processing kernels) - StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld, - const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, - const bool a_conjugate, const bool b_conjugate, - const size_t a_one, const size_t a_two, const bool a_want_rotated, - const size_t b_one, const size_t b_two, const bool b_want_rotated, - const size_t c_one, const size_t c_two, const bool c_want_rotated); + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate, + const size_t a_one, const size_t a_two, const bool a_want_rotated, + const size_t b_one, const size_t b_two, const bool b_want_rotated, + const size_t c_one, const size_t c_two, const bool c_want_rotated); // Direct version of GEMM (no pre and post-processing kernels) - StatusCode GemmDirect(const size_t m, const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld, - const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, - const bool a_conjugate, const bool b_conjugate); + void GemmDirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate); }; // ================================================================================================= diff --git a/src/routines/level3/xhemm.cpp b/src/routines/level3/xhemm.cpp index 9813503e..e5b1502a 100644 --- a/src/routines/level3/xhemm.cpp +++ b/src/routines/level3/xhemm.cpp @@ -29,7 +29,7 @@ Xhemm<T>::Xhemm(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle, +void Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, @@ -38,15 +38,14 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } + if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the // left) or B (on the right) in the Xgemm routine. auto k = (side == Side::kLeft) ? m : n; // Checks for validity of the squared A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(k, k, a_buffer, a_offset, a_ld); // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix @@ -55,73 +54,68 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared"; // Temporary buffer for a copy of the hermitian matrix - try { - auto temp_herm = Buffer<T>(context_, k*k); - - // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm - // routine afterwards + auto temp_herm = Buffer<T>(context_, k*k); + + // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm + // routine afterwards + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the hermitian-to-squared kernel + kernel.SetArgument(0, static_cast<int>(k)); + kernel.SetArgument(1, static_cast<int>(a_ld)); + kernel.SetArgument(2, static_cast<int>(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast<int>(k)); + kernel.SetArgument(5, static_cast<int>(k)); + kernel.SetArgument(6, static_cast<int>(0)); + kernel.SetArgument(7, temp_herm()); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // hermitian-to-squared kernel uses the same parameters. + auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + auto kernelEvent = Event(); + RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); + + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + + // Runs the regular Xgemm code with either "C := AB+C" or ... + if (side == Side::kLeft) { + DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + temp_herm, 0, k, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld); + } + + // ... with "C := BA+C". Note that A and B are now reversed. + else { try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the hermitian-to-squared kernel - kernel.SetArgument(0, static_cast<int>(k)); - kernel.SetArgument(1, static_cast<int>(a_ld)); - kernel.SetArgument(2, static_cast<int>(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast<int>(k)); - kernel.SetArgument(5, static_cast<int>(k)); - kernel.SetArgument(6, static_cast<int>(0)); - kernel.SetArgument(7, temp_herm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // hermitian-to-squared kernel uses the same parameters. - auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - - // Synchronize now: 'DoGemm' does not accept a list of events to wait for - kernelEvent.WaitForCompletion(); - - // Runs the regular Xgemm code with either "C := AB+C" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - temp_herm, 0, k, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld); - } - - // ... with "C := BA+C". Note that A and B are now reversed. - else { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_herm, 0, k, - beta, - c_buffer, c_offset, c_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } + DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_herm, 0, k, + beta, + c_buffer, c_offset, c_ld); + } catch (BLASError &e) { + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(e.status()) { + case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details()); + case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details()); + case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details()); + case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details()); + case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details()); + case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details()); + default: throw; } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + } + } } // ================================================================================================= diff --git a/src/routines/level3/xhemm.hpp b/src/routines/level3/xhemm.hpp index 272bd2ec..2385706e 100644 --- a/src/routines/level3/xhemm.hpp +++ b/src/routines/level3/xhemm.hpp @@ -37,13 +37,13 @@ class Xhemm: public Xgemm<T> { Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM"); // Templated-precision implementation of the routine - StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + void DoHemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index bf328729..ee3bb8b8 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -22,8 +22,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T, typename U> Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" @@ -32,23 +31,23 @@ Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T, typename U> -StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const U beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { +void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or // to matrix A (argument: conjugate transpose) @@ -71,12 +70,9 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co // matrix A cannot be less than N when rotated, or less than K when not-rotated // matrix B cannot be less than N when rotated, or less than K when not-rotated // matrix C cannot be less than N - auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld); + TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld); + TestMatrixC(n, n, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); @@ -85,145 +81,128 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - ab_rotated == false && ab_conjugate == false; - auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - ab_rotated == false && ab_conjugate == true; - auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && - ab_rotated == false && ab_conjugate == false; - auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && - ab_rotated == false && ab_conjugate == true; - - // Creates the temporary matrices - auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); - - // Convert the arguments to complex versions - auto complex_beta = T{beta, static_cast<U>(0.0)}; - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector<Event>(); - auto emptyEventList = std::vector<Event>(); - - // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. - if (!a1_no_temp) { - auto eventProcessA1 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList, - ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, - ConstantOne<T>(), program, - true, ab_rotated, ab_conjugate); - eventWaitList.push_back(eventProcessA1); - if (ErrorIn(status)) { return status; } - } - if (!a2_no_temp) { - auto eventProcessA2 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList, - ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, - ConstantOne<T>(), program, - true, ab_rotated, !ab_conjugate); - eventWaitList.push_back(eventProcessA2); - if (ErrorIn(status)) { return status; } - } - if (!b1_no_temp) { - auto eventProcessB1 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList, - ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, - ConstantOne<T>(), program, - true, ab_rotated, ab_conjugate); - eventWaitList.push_back(eventProcessB1); - if (ErrorIn(status)) { return status; } - } - if (!b2_no_temp) { - auto eventProcessB2 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList, - ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, - ConstantOne<T>(), program, - true, ab_rotated, !ab_conjugate); - eventWaitList.push_back(eventProcessB2); - if (ErrorIn(status)) { return status; } - } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. - auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, - n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, - true, c_rotated, false); - eventWaitList.push_back(eventProcessC); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(n_ceiled)); - kernel.SetArgument(1, static_cast<int>(k_ceiled)); - kernel.SetArgument(2, GetRealArg(alpha)); - kernel.SetArgument(3, GetRealArg(complex_beta)); - kernel.SetArgument(4, a1_temp()); - kernel.SetArgument(5, b2_temp()); - kernel.SetArgument(6, c_temp()); - - // Computes the global and local thread sizes - auto global = std::vector<size_t>{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel1 = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel1); - - // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha - auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; - auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)}; - kernel.SetArgument(2, GetRealArg(conjugate_alpha)); - kernel.SetArgument(3, GetRealArg(complex_one)); - kernel.SetArgument(4, b1_temp()); - kernel.SetArgument(5, a2_temp()); - - // Runs the kernel again - auto eventKernel2 = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel2); - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, - false, c_rotated, false, upper, lower, true); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + ab_rotated == false && ab_conjugate == false; + auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + ab_rotated == false && ab_conjugate == true; + auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && + ab_rotated == false && ab_conjugate == false; + auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && + ab_rotated == false && ab_conjugate == true; + + // Creates the temporary matrices + auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); + + // Convert the arguments to complex versions + auto complex_beta = T{beta, static_cast<U>(0.0)}; + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. + if (!a1_no_temp) { + auto eventProcessA1 = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, + ConstantOne<T>(), program, + true, ab_rotated, ab_conjugate); + eventWaitList.push_back(eventProcessA1); + } + if (!a2_no_temp) { + auto eventProcessA2 = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, + ConstantOne<T>(), program, + true, ab_rotated, !ab_conjugate); + eventWaitList.push_back(eventProcessA2); + } + if (!b1_no_temp) { + auto eventProcessB1 = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, + ConstantOne<T>(), program, + true, ab_rotated, ab_conjugate); + eventWaitList.push_back(eventProcessB1); + } + if (!b2_no_temp) { + auto eventProcessB2 = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, + ConstantOne<T>(), program, + true, ab_rotated, !ab_conjugate); + eventWaitList.push_back(eventProcessB2); + } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + auto eventProcessC = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + ConstantOne<T>(), program, + true, c_rotated, false); + eventWaitList.push_back(eventProcessC); + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n_ceiled)); + kernel.SetArgument(1, static_cast<int>(k_ceiled)); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(complex_beta)); + kernel.SetArgument(4, a1_temp()); + kernel.SetArgument(5, b2_temp()); + kernel.SetArgument(6, c_temp()); + + // Computes the global and local thread sizes + auto global = std::vector<size_t>{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel1 = Event(); + RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); + eventWaitList.push_back(eventKernel1); + + // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha + auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; + auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)}; + kernel.SetArgument(2, GetRealArg(conjugate_alpha)); + kernel.SetArgument(3, GetRealArg(complex_one)); + kernel.SetArgument(4, b1_temp()); + kernel.SetArgument(5, a2_temp()); + + // Runs the kernel again + auto eventKernel2 = Event(); + RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); + eventWaitList.push_back(eventKernel2); + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + n, n, c_ld, c_offset, c_buffer, + ConstantOne<T>(), program, + false, c_rotated, false, upper, lower, true); } // ================================================================================================= diff --git a/src/routines/level3/xher2k.hpp b/src/routines/level3/xher2k.hpp index 23996219..acc346e4 100644 --- a/src/routines/level3/xher2k.hpp +++ b/src/routines/level3/xher2k.hpp @@ -30,13 +30,13 @@ class Xher2k: public Routine { Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K"); // Templated-precision implementation of the routine - StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const U beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + void DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index 77422526..ae8e9324 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -22,8 +22,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T, typename U> Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" @@ -32,14 +31,14 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T, typename U> -StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const U alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, @@ -47,7 +46,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or // to matrix A (argument: conjugate transpose) @@ -70,10 +69,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons // space. Also tests that the leading dimensions of: // matrix A cannot be less than N when rotated, or less than K when not-rotated // matrix C cannot be less than N - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + TestMatrixC(n, n, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); @@ -82,106 +79,92 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - a_rotated == false && a_conjugate == false; - auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - a_rotated == false && b_conjugate == false; - - // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); - - // Convert the arguments to complex versions - auto complex_alpha = T{alpha, static_cast<U>(0.0)}; - auto complex_beta = T{beta, static_cast<U>(0.0)}; - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector<Event>(); - auto emptyEventList = std::vector<Event>(); - - // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. Two copies are created. - if (!a_no_temp) { - auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, - true, a_rotated, a_conjugate); - eventWaitList.push_back(eventProcessA); - if (ErrorIn(status)) { return status; } - } - if (!b_no_temp) { - auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne<T>(), program, - true, a_rotated, b_conjugate); - eventWaitList.push_back(eventProcessB); - if (ErrorIn(status)) { return status; } - } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. - auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, - n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, - true, c_rotated, false); - eventWaitList.push_back(eventProcessC); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(n_ceiled)); - kernel.SetArgument(1, static_cast<int>(k_ceiled)); - kernel.SetArgument(2, GetRealArg(complex_alpha)); - kernel.SetArgument(3, GetRealArg(complex_beta)); - kernel.SetArgument(4, a_temp()); - kernel.SetArgument(5, b_temp()); - kernel.SetArgument(6, c_temp()); - - // Computes the global and local thread sizes - auto global = std::vector<size_t>{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel); - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, - false, c_rotated, false, upper, lower, true); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + a_rotated == false && a_conjugate == false; + auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + a_rotated == false && b_conjugate == false; + + // Creates the temporary matrices + auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); + + // Convert the arguments to complex versions + auto complex_alpha = T{alpha, static_cast<U>(0.0)}; + auto complex_beta = T{beta, static_cast<U>(0.0)}; + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. Two copies are created. + if (!a_no_temp) { + auto eventProcessA = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a_temp, + ConstantOne<T>(), program, + true, a_rotated, a_conjugate); + eventWaitList.push_back(eventProcessA); + } + if (!b_no_temp) { + auto eventProcessB = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, b_temp, + ConstantOne<T>(), program, + true, a_rotated, b_conjugate); + eventWaitList.push_back(eventProcessB); + } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + auto eventProcessC = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + ConstantOne<T>(), program, + true, c_rotated, false); + eventWaitList.push_back(eventProcessC); + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n_ceiled)); + kernel.SetArgument(1, static_cast<int>(k_ceiled)); + kernel.SetArgument(2, GetRealArg(complex_alpha)); + kernel.SetArgument(3, GetRealArg(complex_beta)); + kernel.SetArgument(4, a_temp()); + kernel.SetArgument(5, b_temp()); + kernel.SetArgument(6, c_temp()); + + // Computes the global and local thread sizes + auto global = std::vector<size_t>{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel = Event(); + RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); + eventWaitList.push_back(eventKernel); + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + n, n, c_ld, c_offset, c_buffer, + ConstantOne<T>(), program, + false, c_rotated, false, upper, lower, true); } // ================================================================================================= diff --git a/src/routines/level3/xherk.hpp b/src/routines/level3/xherk.hpp index 3f156a1b..51f29d7e 100644 --- a/src/routines/level3/xherk.hpp +++ b/src/routines/level3/xherk.hpp @@ -30,12 +30,12 @@ class Xherk: public Routine { Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK"); // Templated-precision implementation of the routine - StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const U alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const U beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + void DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const U alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const U beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= diff --git a/src/routines/level3/xsymm.cpp b/src/routines/level3/xsymm.cpp index 04e4b718..d7f771d1 100644 --- a/src/routines/level3/xsymm.cpp +++ b/src/routines/level3/xsymm.cpp @@ -29,7 +29,7 @@ Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle, +void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, @@ -38,15 +38,14 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } + if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the // left) or B (on the right) in the Xgemm routine. auto k = (side == Side::kLeft) ? m : n; // Checks for validity of the squared A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(k, k, a_buffer, a_offset, a_ld); // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix @@ -55,73 +54,68 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared"; // Temporary buffer for a copy of the symmetric matrix - try { - auto temp_symm = Buffer<T>(context_, k*k); - - // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm - // routine afterwards + auto temp_symm = Buffer<T>(context_, k*k); + + // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm + // routine afterwards + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the symmetric-to-squared kernel + kernel.SetArgument(0, static_cast<int>(k)); + kernel.SetArgument(1, static_cast<int>(a_ld)); + kernel.SetArgument(2, static_cast<int>(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast<int>(k)); + kernel.SetArgument(5, static_cast<int>(k)); + kernel.SetArgument(6, static_cast<int>(0)); + kernel.SetArgument(7, temp_symm()); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // symmetric-to-squared kernel uses the same parameters. + auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + auto kernelEvent = Event(); + RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); + + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + + // Runs the regular Xgemm code with either "C := AB+C" or ... + if (side == Side::kLeft) { + DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + temp_symm, 0, k, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld); + } + + // ... with "C := BA+C". Note that A and B are now reversed. + else { try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the symmetric-to-squared kernel - kernel.SetArgument(0, static_cast<int>(k)); - kernel.SetArgument(1, static_cast<int>(a_ld)); - kernel.SetArgument(2, static_cast<int>(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast<int>(k)); - kernel.SetArgument(5, static_cast<int>(k)); - kernel.SetArgument(6, static_cast<int>(0)); - kernel.SetArgument(7, temp_symm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // symmetric-to-squared kernel uses the same parameters. - auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - - // Synchronize now: 'DoGemm' does not accept a list of events to wait for - kernelEvent.WaitForCompletion(); - - // Runs the regular Xgemm code with either "C := AB+C" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - temp_symm, 0, k, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld); - } - - // ... with "C := BA+C". Note that A and B are now reversed. - else { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_symm, 0, k, - beta, - c_buffer, c_offset, c_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } + DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_symm, 0, k, + beta, + c_buffer, c_offset, c_ld); + } catch (BLASError &e) { + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(e.status()) { + case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details()); + case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details()); + case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details()); + case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details()); + case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details()); + case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details()); + default: throw; } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + } + } } // ================================================================================================= diff --git a/src/routines/level3/xsymm.hpp b/src/routines/level3/xsymm.hpp index 428f78ef..ee965364 100644 --- a/src/routines/level3/xsymm.hpp +++ b/src/routines/level3/xsymm.hpp @@ -39,13 +39,13 @@ class Xsymm: public Xgemm<T> { Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM"); // Templated-precision implementation of the routine - StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + void DoSymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index badf3100..cb0e0461 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -22,8 +22,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" @@ -32,14 +31,14 @@ Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, @@ -48,7 +47,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Computes whether or not the matrices are transposed in memory. This is based on their layout // (row or column-major) and whether or not they are requested to be pre-transposed. @@ -67,12 +66,9 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons // matrix A cannot be less than N when rotated, or less than K when not-rotated // matrix B cannot be less than N when rotated, or less than K when not-rotated // matrix C cannot be less than N - auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld); + TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld); + TestMatrixC(n, n, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); @@ -81,114 +77,99 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - ab_rotated == false; - auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && - ab_rotated == false; - - // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector<Event>(); - auto emptyEventList = std::vector<Event>(); - - // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. - if (!a_no_temp) { - auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, - ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, - true, ab_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessA); - } - if (!b_no_temp) { - auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, - ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne<T>(), program, - true, ab_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessB); - } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. - auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, - n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, - true, c_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessC); - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(n_ceiled)); - kernel.SetArgument(1, static_cast<int>(k_ceiled)); - kernel.SetArgument(2, GetRealArg(alpha)); - kernel.SetArgument(3, GetRealArg(beta)); - kernel.SetArgument(4, a_temp()); - kernel.SetArgument(5, b_temp()); - kernel.SetArgument(6, c_temp()); - - // Computes the global and local thread sizes - auto global = std::vector<size_t>{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel1 = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel1); - - // Swaps the arguments for matrices A and B, and sets 'beta' to 1 - auto one = static_cast<T>(1); - kernel.SetArgument(3, GetRealArg(one)); - kernel.SetArgument(4, b_temp()); - kernel.SetArgument(5, a_temp()); - - // Runs the kernel again - auto eventKernel2 = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel2); - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, - false, c_rotated, false, upper, lower, false); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + ab_rotated == false; + auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && + ab_rotated == false; + + // Creates the temporary matrices + auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. + if (!a_no_temp) { + auto eventProcessA = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a_temp, + ConstantOne<T>(), program, + true, ab_rotated, false); + eventWaitList.push_back(eventProcessA); + } + if (!b_no_temp) { + auto eventProcessB = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, b_temp, + ConstantOne<T>(), program, + true, ab_rotated, false); + eventWaitList.push_back(eventProcessB); + } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + auto eventProcessC = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + ConstantOne<T>(), program, + true, c_rotated, false); + eventWaitList.push_back(eventProcessC); + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n_ceiled)); + kernel.SetArgument(1, static_cast<int>(k_ceiled)); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); + kernel.SetArgument(4, a_temp()); + kernel.SetArgument(5, b_temp()); + kernel.SetArgument(6, c_temp()); + + // Computes the global and local thread sizes + auto global = std::vector<size_t>{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel1 = Event(); + RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); + eventWaitList.push_back(eventKernel1); + + // Swaps the arguments for matrices A and B, and sets 'beta' to 1 + auto one = static_cast<T>(1); + kernel.SetArgument(3, GetRealArg(one)); + kernel.SetArgument(4, b_temp()); + kernel.SetArgument(5, a_temp()); + + // Runs the kernel again + auto eventKernel2 = Event(); + RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); + eventWaitList.push_back(eventKernel2); + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + n, n, c_ld, c_offset, c_buffer, + ConstantOne<T>(), program, + false, c_rotated, false, upper, lower, false); } // ================================================================================================= diff --git a/src/routines/level3/xsyr2k.hpp b/src/routines/level3/xsyr2k.hpp index 56185653..a02c6e16 100644 --- a/src/routines/level3/xsyr2k.hpp +++ b/src/routines/level3/xsyr2k.hpp @@ -30,13 +30,13 @@ class Xsyr2k: public Routine { Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K"); // Templated-precision implementation of the routine - StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + void DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index 438aa218..bd6c4b25 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -22,8 +22,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" @@ -32,14 +31,14 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, @@ -47,7 +46,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } // Computes whether or not the matrices are transposed in memory. This is based on their layout // (row or column-major) and whether or not they are requested to be pre-transposed. @@ -65,10 +64,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const // space. Also tests that the leading dimensions of: // matrix A cannot be less than N when rotated, or less than K when not-rotated // matrix C cannot be less than N - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + TestMatrixC(n, n, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); @@ -77,90 +74,76 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - a_rotated == false; - - // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector<Event>(); - auto emptyEventList = std::vector<Event>(); - - // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. - if (!a_no_temp) { - auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, - true, a_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessA); - } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. - auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, - n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, - true, c_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessC); - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast<int>(n_ceiled)); - kernel.SetArgument(1, static_cast<int>(k_ceiled)); - kernel.SetArgument(2, GetRealArg(alpha)); - kernel.SetArgument(3, GetRealArg(beta)); - kernel.SetArgument(4, a_temp()); - kernel.SetArgument(5, a_temp()); - kernel.SetArgument(6, c_temp()); - - // Computes the global and local thread sizes - auto global = std::vector<size_t>{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel); - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, - false, c_rotated, false, upper, lower, false); - if (ErrorIn(status)) { return status; } - - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + a_rotated == false; + + // Creates the temporary matrices + auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. + if (!a_no_temp) { + auto eventProcessA = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a_temp, + ConstantOne<T>(), program, + true, a_rotated, false); + eventWaitList.push_back(eventProcessA); + } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + auto eventProcessC = Event(); + PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + ConstantOne<T>(), program, + true, c_rotated, false); + eventWaitList.push_back(eventProcessC); + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n_ceiled)); + kernel.SetArgument(1, static_cast<int>(k_ceiled)); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); + kernel.SetArgument(4, a_temp()); + kernel.SetArgument(5, a_temp()); + kernel.SetArgument(6, c_temp()); + + // Computes the global and local thread sizes + auto global = std::vector<size_t>{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel = Event(); + RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); + eventWaitList.push_back(eventKernel); + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + n, n, c_ld, c_offset, c_buffer, + ConstantOne<T>(), program, + false, c_rotated, false, upper, lower, false); } // ================================================================================================= diff --git a/src/routines/level3/xsyrk.hpp b/src/routines/level3/xsyrk.hpp index 7c075c26..de42b824 100644 --- a/src/routines/level3/xsyrk.hpp +++ b/src/routines/level3/xsyrk.hpp @@ -32,12 +32,12 @@ class Xsyrk: public Routine { Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK"); // Templated-precision implementation of the routine - StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + void DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= diff --git a/src/routines/level3/xtrmm.cpp b/src/routines/level3/xtrmm.cpp index 74a82822..6bf77cfa 100644 --- a/src/routines/level3/xtrmm.cpp +++ b/src/routines/level3/xtrmm.cpp @@ -29,7 +29,7 @@ Xtrmm<T>::Xtrmm(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle, +void Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, @@ -37,15 +37,14 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) { // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } + if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); } // Computes the k dimension. This is based on whether or not matrix is A (on the left) // or B (on the right) in the Xgemm routine. auto k = (side == Side::kLeft) ? m : n; // Checks for validity of the triangular A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(k, k, a_buffer, a_offset, a_ld); // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix @@ -57,74 +56,69 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false; // Temporary buffer for a copy of the triangular matrix - try { - auto temp_triangular = Buffer<T>(context_, k*k); - - // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm - // routine afterwards + auto temp_triangular = Buffer<T>(context_, k*k); + + // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm + // routine afterwards + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the triangular-to-squared kernel + kernel.SetArgument(0, static_cast<int>(k)); + kernel.SetArgument(1, static_cast<int>(a_ld)); + kernel.SetArgument(2, static_cast<int>(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast<int>(k)); + kernel.SetArgument(5, static_cast<int>(k)); + kernel.SetArgument(6, static_cast<int>(0)); + kernel.SetArgument(7, temp_triangular()); + kernel.SetArgument(8, static_cast<int>(unit_diagonal)); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // triangular-to-squared kernel uses the same parameters. + auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + auto kernelEvent = Event(); + RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); + + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + + // Runs the regular Xgemm code with either "B := alpha*A*B" or ... + if (side == Side::kLeft) { + DoGemm(layout, a_transpose, Transpose::kNo, + m, n, k, + alpha, + temp_triangular, 0, k, + b_buffer, b_offset, b_ld, + static_cast<T>(0.0), + b_buffer, b_offset, b_ld); + } + + // ... with "B := alpha*B*A". Note that A and B are now reversed. + else { try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the triangular-to-squared kernel - kernel.SetArgument(0, static_cast<int>(k)); - kernel.SetArgument(1, static_cast<int>(a_ld)); - kernel.SetArgument(2, static_cast<int>(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast<int>(k)); - kernel.SetArgument(5, static_cast<int>(k)); - kernel.SetArgument(6, static_cast<int>(0)); - kernel.SetArgument(7, temp_triangular()); - kernel.SetArgument(8, static_cast<int>(unit_diagonal)); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // triangular-to-squared kernel uses the same parameters. - auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - - // Synchronize now: 'DoGemm' does not accept a list of events to wait for - kernelEvent.WaitForCompletion(); - - // Runs the regular Xgemm code with either "B := alpha*A*B" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, a_transpose, Transpose::kNo, - m, n, k, - alpha, - temp_triangular, 0, k, - b_buffer, b_offset, b_ld, - static_cast<T>(0.0), - b_buffer, b_offset, b_ld); - } - - // ... with "B := alpha*B*A". Note that A and B are now reversed. - else { - status = DoGemm(layout, Transpose::kNo, a_transpose, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_triangular, 0, k, - static_cast<T>(0.0), - b_buffer, b_offset, b_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } + DoGemm(layout, Transpose::kNo, a_transpose, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_triangular, 0, k, + static_cast<T>(0.0), + b_buffer, b_offset, b_ld); + } catch (BLASError &e) { + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(e.status()) { + case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details()); + case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details()); + case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details()); + case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details()); + case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details()); + case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details()); + default: throw; } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + } + } } // ================================================================================================= diff --git a/src/routines/level3/xtrmm.hpp b/src/routines/level3/xtrmm.hpp index 186a120e..967bf132 100644 --- a/src/routines/level3/xtrmm.hpp +++ b/src/routines/level3/xtrmm.hpp @@ -38,12 +38,12 @@ class Xtrmm: public Xgemm<T> { Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM"); // Templated-precision implementation of the routine - StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld); + void DoTrmm(const Layout layout, const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld); }; // ================================================================================================= diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp index af9080af..875ca7d2 100644 --- a/src/routines/levelx/xomatcopy.cpp +++ b/src/routines/levelx/xomatcopy.cpp @@ -22,27 +22,26 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xomatcopy<T>::Xomatcopy(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue<T>()) { - source_string_ = + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" #include "../../kernels/level3/copy_pad.opencl" #include "../../kernels/level3/transpose_fast.opencl" #include "../../kernels/level3/transpose_pad.opencl" - ; + }) { } // ================================================================================================= // The main routine template <typename T> -StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) { +void Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) { // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } + if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); } // Determines whether to transpose the matrix A const auto transpose = (a_transpose != Transpose::kNo); @@ -63,22 +62,17 @@ StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_trans // Also tests that the leading dimensions of: // matrix A cannot be less than N when rotated, or less than M when not-rotated // matrix B cannot be less than M when rotated, or less than N when not-rotated - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); - if (ErrorIn(status)) { return status; } + TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); // Loads the program from the database const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); auto emptyEventList = std::vector<Event>(); - status = PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - b_one, b_two, b_ld, b_offset, b_buffer, - alpha, program, false, transpose, conjugate); - if (ErrorIn(status)) { return status; } - - return StatusCode::kSuccess; + PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + b_one, b_two, b_ld, b_offset, b_buffer, + alpha, program, false, transpose, conjugate); } // ================================================================================================= diff --git a/src/routines/levelx/xomatcopy.hpp b/src/routines/levelx/xomatcopy.hpp index 0e580230..2da66693 100644 --- a/src/routines/levelx/xomatcopy.hpp +++ b/src/routines/levelx/xomatcopy.hpp @@ -28,10 +28,10 @@ class Xomatcopy: public Routine { Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY"); // Templated-precision implementation of the routine - StatusCode DoOmatcopy(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld); + void DoOmatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld); }; // ================================================================================================= |