diff options
Diffstat (limited to 'src/routines/level1')
-rw-r--r-- | src/routines/level1/xamax.cpp | 95 | ||||
-rw-r--r-- | src/routines/level1/xamax.hpp | 6 | ||||
-rw-r--r-- | src/routines/level1/xasum.cpp | 89 | ||||
-rw-r--r-- | src/routines/level1/xasum.hpp | 6 | ||||
-rw-r--r-- | src/routines/level1/xaxpy.cpp | 86 | ||||
-rw-r--r-- | src/routines/level1/xaxpy.hpp | 6 | ||||
-rw-r--r-- | src/routines/level1/xcopy.cpp | 82 | ||||
-rw-r--r-- | src/routines/level1/xcopy.hpp | 6 | ||||
-rw-r--r-- | src/routines/level1/xdot.cpp | 104 | ||||
-rw-r--r-- | src/routines/level1/xdot.hpp | 10 | ||||
-rw-r--r-- | src/routines/level1/xdotc.cpp | 16 | ||||
-rw-r--r-- | src/routines/level1/xdotc.hpp | 8 | ||||
-rw-r--r-- | src/routines/level1/xdotu.cpp | 16 | ||||
-rw-r--r-- | src/routines/level1/xdotu.hpp | 8 | ||||
-rw-r--r-- | src/routines/level1/xmax.hpp | 8 | ||||
-rw-r--r-- | src/routines/level1/xmin.hpp | 8 | ||||
-rw-r--r-- | src/routines/level1/xnrm2.cpp | 89 | ||||
-rw-r--r-- | src/routines/level1/xnrm2.hpp | 6 | ||||
-rw-r--r-- | src/routines/level1/xscal.cpp | 73 | ||||
-rw-r--r-- | src/routines/level1/xscal.hpp | 4 | ||||
-rw-r--r-- | src/routines/level1/xsum.hpp | 8 | ||||
-rw-r--r-- | src/routines/level1/xswap.cpp | 82 | ||||
-rw-r--r-- | src/routines/level1/xswap.hpp | 6 |
23 files changed, 377 insertions, 445 deletions
diff --git a/src/routines/level1/xamax.cpp b/src/routines/level1/xamax.cpp index 6b6e7f9e..8307188b 100644 --- a/src/routines/level1/xamax.cpp +++ b/src/routines/level1/xamax.cpp @@ -32,64 +32,55 @@ Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xamax<T>::DoAmax(const size_t n, - const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xamax<T>::DoAmax(const size_t n, + const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorIndex(1, imax_buffer, imax_offset); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorIndex(1, imax_buffer, imax_offset); // Retrieves the Xamax kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xamax"); - auto kernel2 = Kernel(program, "XamaxEpilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer1 = Buffer<T>(context_, temp_size); - auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast<int>(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast<int>(x_offset)); - kernel1.SetArgument(3, static_cast<int>(x_inc)); - kernel1.SetArgument(4, temp_buffer1()); - kernel1.SetArgument(5, temp_buffer2()); - - // Event waiting list - auto eventWaitList = std::vector<Event>(); - - // Launches the main kernel - auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; - auto local1 = std::vector<size_t>{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer1()); - kernel2.SetArgument(1, temp_buffer2()); - kernel2.SetArgument(2, imax_buffer()); - kernel2.SetArgument(3, static_cast<int>(imax_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector<size_t>{db_["WGS2"]}; - auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel1 = Kernel(program, "Xamax"); + auto kernel2 = Kernel(program, "XamaxEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer1 = Buffer<T>(context_, temp_size); + auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast<int>(x_offset)); + kernel1.SetArgument(3, static_cast<int>(x_inc)); + kernel1.SetArgument(4, temp_buffer1()); + kernel1.SetArgument(5, temp_buffer2()); + + // Event waiting list + auto eventWaitList = std::vector<Event>(); + + // Launches the main kernel + auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; + auto local1 = std::vector<size_t>{db_["WGS1"]}; + auto kernelEvent = Event(); + RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer1()); + kernel2.SetArgument(1, temp_buffer2()); + kernel2.SetArgument(2, imax_buffer()); + kernel2.SetArgument(3, static_cast<int>(imax_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector<size_t>{db_["WGS2"]}; + auto local2 = std::vector<size_t>{db_["WGS2"]}; + RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); } // ================================================================================================= diff --git a/src/routines/level1/xamax.hpp b/src/routines/level1/xamax.hpp index aa45a8e4..4d1e0082 100644 --- a/src/routines/level1/xamax.hpp +++ b/src/routines/level1/xamax.hpp @@ -28,9 +28,9 @@ class Xamax: public Routine { Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX"); // Templated-precision implementation of the routine - StatusCode DoAmax(const size_t n, - const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoAmax(const size_t n, + const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xasum.cpp b/src/routines/level1/xasum.cpp index 0c1ce903..9dde7a87 100644 --- a/src/routines/level1/xasum.cpp +++ b/src/routines/level1/xasum.cpp @@ -32,61 +32,52 @@ Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xasum<T>::DoAsum(const size_t n, - const Buffer<T> &asum_buffer, const size_t asum_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xasum<T>::DoAsum(const size_t n, + const Buffer<T> &asum_buffer, const size_t asum_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorScalar(1, asum_buffer, asum_offset); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorScalar(1, asum_buffer, asum_offset); // Retrieves the Xasum kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xasum"); - auto kernel2 = Kernel(program, "XasumEpilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer = Buffer<T>(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast<int>(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast<int>(x_offset)); - kernel1.SetArgument(3, static_cast<int>(x_inc)); - kernel1.SetArgument(4, temp_buffer()); - - // Event waiting list - auto eventWaitList = std::vector<Event>(); - - // Launches the main kernel - auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; - auto local1 = std::vector<size_t>{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer()); - kernel2.SetArgument(1, asum_buffer()); - kernel2.SetArgument(2, static_cast<int>(asum_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector<size_t>{db_["WGS2"]}; - auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel1 = Kernel(program, "Xasum"); + auto kernel2 = Kernel(program, "XasumEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer = Buffer<T>(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast<int>(x_offset)); + kernel1.SetArgument(3, static_cast<int>(x_inc)); + kernel1.SetArgument(4, temp_buffer()); + + // Event waiting list + auto eventWaitList = std::vector<Event>(); + + // Launches the main kernel + auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; + auto local1 = std::vector<size_t>{db_["WGS1"]}; + auto kernelEvent = Event(); + RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, asum_buffer()); + kernel2.SetArgument(2, static_cast<int>(asum_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector<size_t>{db_["WGS2"]}; + auto local2 = std::vector<size_t>{db_["WGS2"]}; + RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); } // ================================================================================================= diff --git a/src/routines/level1/xasum.hpp b/src/routines/level1/xasum.hpp index 5a253f4d..0afcc4ff 100644 --- a/src/routines/level1/xasum.hpp +++ b/src/routines/level1/xasum.hpp @@ -28,9 +28,9 @@ class Xasum: public Routine { Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM"); // Templated-precision implementation of the routine - StatusCode DoAsum(const size_t n, - const Buffer<T> &asum_buffer, const size_t asum_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoAsum(const size_t n, + const Buffer<T> &asum_buffer, const size_t asum_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp index 3445e2b5..cbcbb3cd 100644 --- a/src/routines/level1/xaxpy.cpp +++ b/src/routines/level1/xaxpy.cpp @@ -33,18 +33,16 @@ Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xaxpy<T>::DoAxpy(const size_t n, const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version can be used bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && @@ -55,45 +53,39 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha, auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy"; // Retrieves the Xaxpy kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, GetRealArg(alpha)); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, GetRealArg(alpha)); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast<int>(x_offset)); - kernel.SetArgument(4, static_cast<int>(x_inc)); - kernel.SetArgument(5, y_buffer()); - kernel.SetArgument(6, static_cast<int>(y_offset)); - kernel.SetArgument(7, static_cast<int>(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, GetRealArg(alpha)); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, GetRealArg(alpha)); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast<int>(x_offset)); + kernel.SetArgument(4, static_cast<int>(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, static_cast<int>(y_offset)); + kernel.SetArgument(7, static_cast<int>(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } } // ================================================================================================= diff --git a/src/routines/level1/xaxpy.hpp b/src/routines/level1/xaxpy.hpp index caac871e..9b30dfaa 100644 --- a/src/routines/level1/xaxpy.hpp +++ b/src/routines/level1/xaxpy.hpp @@ -28,9 +28,9 @@ class Xaxpy: public Routine { Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY"); // Templated-precision implementation of the routine - StatusCode DoAxpy(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoAxpy(const size_t n, const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xcopy.cpp b/src/routines/level1/xcopy.cpp index 673ef349..3bfbada6 100644 --- a/src/routines/level1/xcopy.cpp +++ b/src/routines/level1/xcopy.cpp @@ -33,18 +33,16 @@ Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xcopy<T>::DoCopy(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xcopy<T>::DoCopy(const size_t n, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version can be used bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && @@ -55,43 +53,37 @@ StatusCode Xcopy<T>::DoCopy(const size_t n, auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy"; // Retrieves the Xcopy kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, static_cast<int>(x_offset)); - kernel.SetArgument(3, static_cast<int>(x_inc)); - kernel.SetArgument(4, y_buffer()); - kernel.SetArgument(5, static_cast<int>(y_offset)); - kernel.SetArgument(6, static_cast<int>(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, static_cast<int>(x_offset)); + kernel.SetArgument(3, static_cast<int>(x_inc)); + kernel.SetArgument(4, y_buffer()); + kernel.SetArgument(5, static_cast<int>(y_offset)); + kernel.SetArgument(6, static_cast<int>(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } } // ================================================================================================= diff --git a/src/routines/level1/xcopy.hpp b/src/routines/level1/xcopy.hpp index 0c424ba3..a6454fcc 100644 --- a/src/routines/level1/xcopy.hpp +++ b/src/routines/level1/xcopy.hpp @@ -28,9 +28,9 @@ class Xcopy: public Routine { Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY"); // Templated-precision implementation of the routine - StatusCode DoCopy(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoCopy(const size_t n, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xdot.cpp b/src/routines/level1/xdot.cpp index bafea157..e0d297f8 100644 --- a/src/routines/level1/xdot.cpp +++ b/src/routines/level1/xdot.cpp @@ -32,69 +32,59 @@ Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xdot<T>::DoDot(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const bool do_conjugate) { +void Xdot<T>::DoDot(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const bool do_conjugate) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorScalar(1, dot_buffer, dot_offset); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); + TestVectorScalar(1, dot_buffer, dot_offset); // Retrieves the Xdot kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xdot"); - auto kernel2 = Kernel(program, "XdotEpilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer = Buffer<T>(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast<int>(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast<int>(x_offset)); - kernel1.SetArgument(3, static_cast<int>(x_inc)); - kernel1.SetArgument(4, y_buffer()); - kernel1.SetArgument(5, static_cast<int>(y_offset)); - kernel1.SetArgument(6, static_cast<int>(y_inc)); - kernel1.SetArgument(7, temp_buffer()); - kernel1.SetArgument(8, static_cast<int>(do_conjugate)); - - // Event waiting list - auto eventWaitList = std::vector<Event>(); - - // Launches the main kernel - auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; - auto local1 = std::vector<size_t>{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer()); - kernel2.SetArgument(1, dot_buffer()); - kernel2.SetArgument(2, static_cast<int>(dot_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector<size_t>{db_["WGS2"]}; - auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel1 = Kernel(program, "Xdot"); + auto kernel2 = Kernel(program, "XdotEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer = Buffer<T>(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast<int>(x_offset)); + kernel1.SetArgument(3, static_cast<int>(x_inc)); + kernel1.SetArgument(4, y_buffer()); + kernel1.SetArgument(5, static_cast<int>(y_offset)); + kernel1.SetArgument(6, static_cast<int>(y_inc)); + kernel1.SetArgument(7, temp_buffer()); + kernel1.SetArgument(8, static_cast<int>(do_conjugate)); + + // Event waiting list + auto eventWaitList = std::vector<Event>(); + + // Launches the main kernel + auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; + auto local1 = std::vector<size_t>{db_["WGS1"]}; + auto kernelEvent = Event(); + RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, dot_buffer()); + kernel2.SetArgument(2, static_cast<int>(dot_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector<size_t>{db_["WGS2"]}; + auto local2 = std::vector<size_t>{db_["WGS2"]}; + RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); } // ================================================================================================= diff --git a/src/routines/level1/xdot.hpp b/src/routines/level1/xdot.hpp index 02c1efaa..a4c9dfa0 100644 --- a/src/routines/level1/xdot.hpp +++ b/src/routines/level1/xdot.hpp @@ -28,11 +28,11 @@ class Xdot: public Routine { Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT"); // Templated-precision implementation of the routine - StatusCode DoDot(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const bool do_conjugate = false); + void DoDot(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + const bool do_conjugate = false); }; // ================================================================================================= diff --git a/src/routines/level1/xdotc.cpp b/src/routines/level1/xdotc.cpp index 27cf2bab..5a4e939a 100644 --- a/src/routines/level1/xdotc.cpp +++ b/src/routines/level1/xdotc.cpp @@ -29,14 +29,14 @@ Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xdotc<T>::DoDotc(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { - return DoDot(n, dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - true); +void Xdotc<T>::DoDotc(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + DoDot(n, dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + true); } // ================================================================================================= diff --git a/src/routines/level1/xdotc.hpp b/src/routines/level1/xdotc.hpp index b8cbdaf5..ab7465f5 100644 --- a/src/routines/level1/xdotc.hpp +++ b/src/routines/level1/xdotc.hpp @@ -31,10 +31,10 @@ class Xdotc: public Xdot<T> { Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC"); // Templated-precision implementation of the routine - StatusCode DoDotc(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoDotc(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xdotu.cpp b/src/routines/level1/xdotu.cpp index 0bce70b7..b9d8bcef 100644 --- a/src/routines/level1/xdotu.cpp +++ b/src/routines/level1/xdotu.cpp @@ -28,14 +28,14 @@ Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xdotu<T>::DoDotu(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { - return DoDot(n, dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - false); +void Xdotu<T>::DoDotu(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + DoDot(n, dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + false); } // ================================================================================================= diff --git a/src/routines/level1/xdotu.hpp b/src/routines/level1/xdotu.hpp index b3f73086..cad91c58 100644 --- a/src/routines/level1/xdotu.hpp +++ b/src/routines/level1/xdotu.hpp @@ -31,10 +31,10 @@ class Xdotu: public Xdot<T> { Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU"); // Templated-precision implementation of the routine - StatusCode DoDotu(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoDotu(const size_t n, + const Buffer<T> &dot_buffer, const size_t dot_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xmax.hpp b/src/routines/level1/xmax.hpp index 5a0236f2..2b7a5ae7 100644 --- a/src/routines/level1/xmax.hpp +++ b/src/routines/level1/xmax.hpp @@ -35,10 +35,10 @@ class Xmax: public Xamax<T> { // Forwards to the regular absolute version. The implementation difference is realised in the // kernel through a pre-processor macro based on the name of the routine. - StatusCode DoMax(const size_t n, - const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { - return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc); + void DoMax(const size_t n, + const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc); } }; diff --git a/src/routines/level1/xmin.hpp b/src/routines/level1/xmin.hpp index 6befec64..47a195ea 100644 --- a/src/routines/level1/xmin.hpp +++ b/src/routines/level1/xmin.hpp @@ -35,10 +35,10 @@ class Xmin: public Xamax<T> { // Forwards to the regular max-absolute version. The implementation difference is realised in the // kernel through a pre-processor macro based on the name of the routine. - StatusCode DoMin(const size_t n, - const Buffer<unsigned int> &imin_buffer, const size_t imin_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { - return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc); + void DoMin(const size_t n, + const Buffer<unsigned int> &imin_buffer, const size_t imin_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc); } }; diff --git a/src/routines/level1/xnrm2.cpp b/src/routines/level1/xnrm2.cpp index 97615d8b..eb795498 100644 --- a/src/routines/level1/xnrm2.cpp +++ b/src/routines/level1/xnrm2.cpp @@ -32,61 +32,52 @@ Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xnrm2<T>::DoNrm2(const size_t n, - const Buffer<T> &nrm2_buffer, const size_t nrm2_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xnrm2<T>::DoNrm2(const size_t n, + const Buffer<T> &nrm2_buffer, const size_t nrm2_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorScalar(1, nrm2_buffer, nrm2_offset); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorScalar(1, nrm2_buffer, nrm2_offset); // Retrieves the Xnrm2 kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xnrm2"); - auto kernel2 = Kernel(program, "Xnrm2Epilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer = Buffer<T>(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast<int>(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast<int>(x_offset)); - kernel1.SetArgument(3, static_cast<int>(x_inc)); - kernel1.SetArgument(4, temp_buffer()); - - // Event waiting list - auto eventWaitList = std::vector<Event>(); - - // Launches the main kernel - auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; - auto local1 = std::vector<size_t>{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer()); - kernel2.SetArgument(1, nrm2_buffer()); - kernel2.SetArgument(2, static_cast<int>(nrm2_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector<size_t>{db_["WGS2"]}; - auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel1 = Kernel(program, "Xnrm2"); + auto kernel2 = Kernel(program, "Xnrm2Epilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer = Buffer<T>(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast<int>(x_offset)); + kernel1.SetArgument(3, static_cast<int>(x_inc)); + kernel1.SetArgument(4, temp_buffer()); + + // Event waiting list + auto eventWaitList = std::vector<Event>(); + + // Launches the main kernel + auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; + auto local1 = std::vector<size_t>{db_["WGS1"]}; + auto kernelEvent = Event(); + RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, nrm2_buffer()); + kernel2.SetArgument(2, static_cast<int>(nrm2_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector<size_t>{db_["WGS2"]}; + auto local2 = std::vector<size_t>{db_["WGS2"]}; + RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); } // ================================================================================================= diff --git a/src/routines/level1/xnrm2.hpp b/src/routines/level1/xnrm2.hpp index 7baf07f5..3183ce24 100644 --- a/src/routines/level1/xnrm2.hpp +++ b/src/routines/level1/xnrm2.hpp @@ -28,9 +28,9 @@ class Xnrm2: public Routine { Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2"); // Templated-precision implementation of the routine - StatusCode DoNrm2(const size_t n, - const Buffer<T> &nrm2_buffer, const size_t nrm2_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoNrm2(const size_t n, + const Buffer<T> &nrm2_buffer, const size_t nrm2_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp index bcc43c3b..ed126879 100644 --- a/src/routines/level1/xscal.cpp +++ b/src/routines/level1/xscal.cpp @@ -33,15 +33,14 @@ Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xscal<T>::DoScal(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { +void Xscal<T>::DoScal(const size_t n, const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vector for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); // Determines whether or not the fast-version can be used bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && @@ -51,41 +50,35 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha, auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal"; // Retrieves the Xscal kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, alpha); - kernel.SetArgument(2, x_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, alpha); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast<int>(x_offset)); - kernel.SetArgument(4, static_cast<int>(x_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + } + else { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast<int>(x_offset)); + kernel.SetArgument(4, static_cast<int>(x_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } } // ================================================================================================= diff --git a/src/routines/level1/xscal.hpp b/src/routines/level1/xscal.hpp index 6c585cb2..02c847cc 100644 --- a/src/routines/level1/xscal.hpp +++ b/src/routines/level1/xscal.hpp @@ -28,8 +28,8 @@ class Xscal: public Routine { Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL"); // Templated-precision implementation of the routine - StatusCode DoScal(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + void DoScal(const size_t n, const T alpha, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); }; // ================================================================================================= diff --git a/src/routines/level1/xsum.hpp b/src/routines/level1/xsum.hpp index 84e20bea..a69d6511 100644 --- a/src/routines/level1/xsum.hpp +++ b/src/routines/level1/xsum.hpp @@ -35,10 +35,10 @@ class Xsum: public Xasum<T> { // Forwards to the regular absolute version. The implementation difference is realised in the // kernel through a pre-processor macro based on the name of the routine. - StatusCode DoSum(const size_t n, - const Buffer<T> &sum_buffer, const size_t sum_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { - return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc); + void DoSum(const size_t n, + const Buffer<T> &sum_buffer, const size_t sum_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc); } }; diff --git a/src/routines/level1/xswap.cpp b/src/routines/level1/xswap.cpp index 03907cbd..2f2c0370 100644 --- a/src/routines/level1/xswap.cpp +++ b/src/routines/level1/xswap.cpp @@ -33,18 +33,16 @@ Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> -StatusCode Xswap<T>::DoSwap(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { +void Xswap<T>::DoSwap(const size_t n, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } + TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorY(n, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version can be used bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && @@ -55,43 +53,37 @@ StatusCode Xswap<T>::DoSwap(const size_t n, auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap"; // Retrieves the Xswap kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, static_cast<int>(x_offset)); - kernel.SetArgument(3, static_cast<int>(x_inc)); - kernel.SetArgument(4, y_buffer()); - kernel.SetArgument(5, static_cast<int>(y_offset)); - kernel.SetArgument(6, static_cast<int>(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; - auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, static_cast<int>(x_offset)); + kernel.SetArgument(3, static_cast<int>(x_inc)); + kernel.SetArgument(4, y_buffer()); + kernel.SetArgument(5, static_cast<int>(y_offset)); + kernel.SetArgument(6, static_cast<int>(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } } // ================================================================================================= diff --git a/src/routines/level1/xswap.hpp b/src/routines/level1/xswap.hpp index 4f9ea36d..eadf58e5 100644 --- a/src/routines/level1/xswap.hpp +++ b/src/routines/level1/xswap.hpp @@ -28,9 +28,9 @@ class Xswap: public Routine { Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP"); // Templated-precision implementation of the routine - StatusCode DoSwap(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + void DoSwap(const size_t n, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); }; // ================================================================================================= |