From aa852bbe67a7dc9018afd7d1349184f0284d215c Mon Sep 17 00:00:00 2001 From: CNugteren Date: Sun, 12 Jul 2015 16:57:09 +0200 Subject: Added subfolders for the level1/2/3 routines --- src/routines/level1/xaxpy.cc | 115 +++++++++++++++++++++++++++ src/routines/level2/xgemv.cc | 146 ++++++++++++++++++++++++++++++++++ src/routines/level3/xgemm.cc | 172 ++++++++++++++++++++++++++++++++++++++++ src/routines/level3/xhemm.cc | 130 ++++++++++++++++++++++++++++++ src/routines/level3/xher2k.cc | 178 ++++++++++++++++++++++++++++++++++++++++++ src/routines/level3/xherk.cc | 156 ++++++++++++++++++++++++++++++++++++ src/routines/level3/xsymm.cc | 132 +++++++++++++++++++++++++++++++ src/routines/level3/xsyr2k.cc | 166 +++++++++++++++++++++++++++++++++++++++ src/routines/level3/xsyrk.cc | 147 ++++++++++++++++++++++++++++++++++ src/routines/level3/xtrmm.cc | 135 ++++++++++++++++++++++++++++++++ src/routines/xaxpy.cc | 115 --------------------------- src/routines/xgemm.cc | 172 ---------------------------------------- src/routines/xgemv.cc | 146 ---------------------------------- src/routines/xhemm.cc | 130 ------------------------------ src/routines/xher2k.cc | 178 ------------------------------------------ src/routines/xherk.cc | 156 ------------------------------------ src/routines/xsymm.cc | 132 ------------------------------- src/routines/xsyr2k.cc | 166 --------------------------------------- src/routines/xsyrk.cc | 147 ---------------------------------- src/routines/xtrmm.cc | 135 -------------------------------- 20 files changed, 1477 insertions(+), 1477 deletions(-) create mode 100644 src/routines/level1/xaxpy.cc create mode 100644 src/routines/level2/xgemv.cc create mode 100644 src/routines/level3/xgemm.cc create mode 100644 src/routines/level3/xhemm.cc create mode 100644 src/routines/level3/xher2k.cc create mode 100644 src/routines/level3/xherk.cc create mode 100644 src/routines/level3/xsymm.cc create mode 100644 src/routines/level3/xsyr2k.cc create mode 100644 src/routines/level3/xsyrk.cc create mode 100644 src/routines/level3/xtrmm.cc delete mode 100644 src/routines/xaxpy.cc delete mode 100644 src/routines/xgemm.cc delete mode 100644 src/routines/xgemv.cc delete mode 100644 src/routines/xhemm.cc delete mode 100644 src/routines/xher2k.cc delete mode 100644 src/routines/xherk.cc delete mode 100644 src/routines/xsymm.cc delete mode 100644 src/routines/xsyr2k.cc delete mode 100644 src/routines/xsyrk.cc delete mode 100644 src/routines/xtrmm.cc (limited to 'src/routines') diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc new file mode 100644 index 00000000..fba36851 --- /dev/null +++ b/src/routines/level1/xaxpy.cc @@ -0,0 +1,115 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xaxpy class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level1/xaxpy.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xaxpy::precision_ = Precision::kSingle; +template <> const Precision Xaxpy::precision_ = Precision::kDouble; +template <> const Precision Xaxpy::precision_ = Precision::kComplexSingle; +template <> const Precision Xaxpy::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xaxpy::Xaxpy(CommandQueue &queue, Event &event): + Routine(queue, event, {"Xaxpy"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines whether or not the fast-version can be used + bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && + (y_offset == 0) && (y_inc == 1) && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + + // If possible, run the fast-version of the kernel + auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy"; + + // Retrieves the Xaxpy kernel from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast(x_offset)); + kernel.SetArgument(4, static_cast(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, static_cast(y_offset)); + kernel.SetArgument(7, static_cast(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"]}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + if (ErrorIn(status)) { return status; } + + // Waits for all kernels to finish + queue_.Finish(); + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xaxpy; +template class Xaxpy; +template class Xaxpy; +template class Xaxpy; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc new file mode 100644 index 00000000..181337b6 --- /dev/null +++ b/src/routines/level2/xgemv.cc @@ -0,0 +1,146 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgemv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xgemv.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xgemv::precision_ = Precision::kSingle; +template <> const Precision Xgemv::precision_ = Precision::kDouble; +template <> const Precision Xgemv::precision_ = Precision::kComplexSingle; +template <> const Precision Xgemv::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xgemv::Xgemv(CommandQueue &queue, Event &event): + Routine(queue, event, {"Xgemv"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xgemv::DoGemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Makes sure all dimensions are larger than zero + if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrix has an alternative layout (row or column-major). + auto a_altlayout = (layout == Layout::kRowMajor); + auto a_one = (a_altlayout) ? n : m; + auto a_two = (a_altlayout) ? m : n; + + // Swap m and n if the matrix is transposed + auto a_transposed = (a_transpose != Transpose::kNo); + auto m_real = (a_transposed) ? n : m; + auto n_real = (a_transposed) ? m : n; + + // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator) + auto a_rotated = a_transposed ^ a_altlayout; + + // In case of complex data-types, the transpose can also become a conjugate transpose + auto a_conjugate = (a_transpose == Transpose::kConjugate); + + // Tests the matrix and the vectors for validity + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines whether or not the fast-version can be used + bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && + IsMultiple(m, db_["WGS2"]*db_["WPT2"]) && + IsMultiple(n, db_["WGS2"]) && + IsMultiple(a_ld, db_["VW2"]); + bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) && + IsMultiple(m, db_["WGS3"]*db_["WPT3"]) && + IsMultiple(n, db_["WGS3"]) && + IsMultiple(a_ld, db_["VW3"]); + + // If possible, run the fast-version (rotated or non-rotated) of the kernel + auto kernel_name = "Xgemv"; + auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]); + auto global_size = m_ceiled / db_["WPT1"]; + auto local_size = db_["WGS1"]; + if (use_fast_kernel) { + kernel_name = "XgemvFast"; + global_size = m_real / db_["WPT2"]; + local_size = db_["WGS2"]; + } + if (use_fast_kernel_rot) { + kernel_name = "XgemvFastRot"; + global_size = m_real / db_["WPT3"]; + local_size = db_["WGS3"]; + } + + // Retrieves the Xgemv kernel from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(m_real)); + kernel.SetArgument(1, static_cast(n_real)); + kernel.SetArgument(2, alpha); + kernel.SetArgument(3, beta); + kernel.SetArgument(4, static_cast(a_rotated)); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, static_cast(a_offset)); + kernel.SetArgument(7, static_cast(a_ld)); + kernel.SetArgument(8, x_buffer()); + kernel.SetArgument(9, static_cast(x_offset)); + kernel.SetArgument(10, static_cast(x_inc)); + kernel.SetArgument(11, y_buffer()); + kernel.SetArgument(12, static_cast(y_offset)); + kernel.SetArgument(13, static_cast(y_inc)); + kernel.SetArgument(14, static_cast(a_conjugate)); + + // Launches the kernel + auto global = std::vector{global_size}; + auto local = std::vector{local_size}; + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Waits for all kernels to finish + queue_.Finish(); + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xgemv; +template class Xgemv; +template class Xgemv; +template class Xgemv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc new file mode 100644 index 00000000..f4a9f737 --- /dev/null +++ b/src/routines/level3/xgemm.cc @@ -0,0 +1,172 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgemm class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level3/xgemm.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xgemm::precision_ = Precision::kSingle; +template <> const Precision Xgemm::precision_ = Precision::kDouble; +template <> const Precision Xgemm::precision_ = Precision::kComplexSingle; +template <> const Precision Xgemm::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xgemm::Xgemm(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xgemm::DoGemm(const Layout layout, + const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. Note + // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of + // col-major) to be transformed, so transposing requirements are not the same as whether or not + // the matrix is actually transposed in memory. + auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); + auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && b_transpose == Transpose::kNo); + auto c_rotated = (layout == Layout::kRowMajor); + auto a_do_transpose = a_rotated; + auto b_do_transpose = !b_rotated; + auto c_do_transpose = c_rotated; + + // In case of complex data-types, the transpose can also become a conjugate transpose + auto a_conjugate = (a_transpose == Transpose::kConjugate); + auto b_conjugate = (b_transpose == Transpose::kConjugate); + + // Computes the first and second dimensions of the 3 matrices taking into account whether the + // matrices are rotated or not + auto a_one = (a_rotated) ? k : m; + auto a_two = (a_rotated) ? m : k; + auto b_one = (b_rotated) ? n : k; + auto b_two = (b_rotated) ? k : n; + auto c_one = (c_rotated) ? n : m; + auto c_two = (c_rotated) ? m : n; + + // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than K when rotated, or less than M when not-rotated + // matrix B cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N when rotated, or less than M when not-rotated + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of m, n, and k + auto m_ceiled = Ceil(m, db_["MWG"]); + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Allocates space on the device for padded and/or transposed input and output matrices. + try { + auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T)); + auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill + // them up until they reach a certain multiple of size (kernel parameter dependent). + status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + m_ceiled, k_ceiled, m_ceiled, 0, temp_a, + a_do_transpose, a_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_b, + b_do_transpose, b_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Only necessary for matrix C if it used both as input and output + if (beta != static_cast(0)) { + status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer, + m_ceiled, n_ceiled, m_ceiled, 0, temp_c, + c_do_transpose, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + } + + // Retrieves the Xgemm kernel from the compiled binary + try { + auto kernel = Kernel(program, "Xgemm"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(m_ceiled)); + kernel.SetArgument(1, static_cast(n_ceiled)); + kernel.SetArgument(2, static_cast(k_ceiled)); + kernel.SetArgument(3, alpha); + kernel.SetArgument(4, beta); + kernel.SetArgument(5, temp_a()); + kernel.SetArgument(6, temp_b()); + kernel.SetArgument(7, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (m_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c, + c_one, c_two, c_ld, c_offset, c_buffer, + c_do_transpose, false, false, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xgemm; +template class Xgemm; +template class Xgemm; +template class Xgemm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc new file mode 100644 index 00000000..bc257c44 --- /dev/null +++ b/src/routines/level3/xhemm.cc @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhemm class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level3/xhemm.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xhemm::Xhemm(CommandQueue &queue, Event &event): + Xgemm(queue, event) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xhemm::DoHemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the + // left) or B (on the right) in the Xgemm routine. + auto k = (side == Side::kLeft) ? m : n; + + // Checks for validity of the squared A matrix + auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as + // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix + bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared"; + + // Temporary buffer for a copy of the hermitian matrix + try { + auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); + + // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm + // routine afterwards + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the hermitian-to-squared kernel + kernel.SetArgument(0, static_cast(k)); + kernel.SetArgument(1, static_cast(a_ld)); + kernel.SetArgument(2, static_cast(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast(k)); + kernel.SetArgument(5, static_cast(k)); + kernel.SetArgument(6, static_cast(0)); + kernel.SetArgument(7, temp_herm()); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // hermitian-to-squared kernel uses the same parameters. + auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the regular Xgemm code with either "C := AB+C" or ... + if (side == Side::kLeft) { + status = DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + temp_herm, 0, k, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld); + } + + // ... with "C := BA+C". Note that A and B are now reversed. + else { + status = DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_herm, 0, k, + beta, + c_buffer, c_offset, c_ld); + + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(status) { + case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; + case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; + case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; + case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; + case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; + case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; + } + } + + // Return the status of the Xgemm routine + return status; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhemm; +template class Xhemm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc new file mode 100644 index 00000000..6d33a0e1 --- /dev/null +++ b/src/routines/level3/xher2k.cc @@ -0,0 +1,178 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xher2k class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level3/xher2k.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xher2k::precision_ = Precision::kComplexSingle; +template <> const Precision Xher2k::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xher2k::Xher2k(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or + // to matrix A (argument: conjugate transpose) + auto ab_conjugate = (ab_transpose != Transpose::kNo); + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. + auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) || + (layout == Layout::kRowMajor && !ab_conjugate); + auto c_rotated = (layout == Layout::kRowMajor); + + // Computes the first and second dimensions of the A and B matrices taking the layout into account + auto ab_one = (ab_rotated) ? k : n; + auto ab_two = (ab_rotated) ? n : k; + + // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix B cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // Allocates space on the device for padded and/or transposed input and output matrices. + try { + auto temp_a1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_b1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_a2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_b2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to + // fill them up until they reach a certain multiple of size (kernel parameter dependent). + status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a1, + ab_rotated, ab_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a2, + ab_rotated, !ab_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_b1, + ab_rotated, ab_conjugate, true, false, false, false, program); + status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_b2, + ab_rotated, !ab_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + c_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + auto complex_beta = T{beta, static_cast(0.0)}; + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha); + kernel.SetArgument(3, complex_beta); + kernel.SetArgument(4, temp_a1()); + kernel.SetArgument(5, temp_b2()); + kernel.SetArgument(6, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha + auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; + auto complex_one = T{static_cast(1.0), static_cast(0.0)}; + kernel.SetArgument(2, conjugate_alpha); + kernel.SetArgument(3, complex_one); + kernel.SetArgument(4, temp_b1()); + kernel.SetArgument(5, temp_a2()); + + // Runs the kernel again + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + n, n, c_ld, c_offset, c_buffer, + c_rotated, false, false, upper, lower, true, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xher2k; +template class Xher2k; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc new file mode 100644 index 00000000..8fae294f --- /dev/null +++ b/src/routines/level3/xherk.cc @@ -0,0 +1,156 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xherk class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level3/xherk.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xherk::precision_ = Precision::kComplexSingle; +template <> const Precision Xherk::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xherk::Xherk(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const U alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const U beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or + // to matrix A (argument: conjugate transpose) + auto a_conjugate = (a_transpose != Transpose::kNo); + auto b_conjugate = (a_transpose == Transpose::kNo); + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. + auto a_rotated = (layout == Layout::kColMajor && a_conjugate) || + (layout == Layout::kRowMajor && !a_conjugate); + auto c_rotated = (layout == Layout::kRowMajor); + + // Computes the first and second dimensions of the A matrix taking the layout into account + auto a_one = (a_rotated) ? k : n; + auto a_two = (a_rotated) ? n : k; + + // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // Allocates space on the device for padded and/or transposed input and output matrices. + try { + auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to + // fill it up until it reaches a certain multiple of size (kernel parameter dependent). It + // creates two copies: + status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a, + a_rotated, a_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_b, + a_rotated, b_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + c_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + auto complex_alpha = T{alpha, static_cast(0.0)}; + auto complex_beta = T{beta, static_cast(0.0)}; + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, complex_alpha); + kernel.SetArgument(3, complex_beta); + kernel.SetArgument(4, temp_a()); + kernel.SetArgument(5, temp_b()); + kernel.SetArgument(6, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + n, n, c_ld, c_offset, c_buffer, + c_rotated, false, false, upper, lower, true, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xherk; +template class Xherk; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc new file mode 100644 index 00000000..1d17f0eb --- /dev/null +++ b/src/routines/level3/xsymm.cc @@ -0,0 +1,132 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsymm class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level3/xsymm.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsymm::Xsymm(CommandQueue &queue, Event &event): + Xgemm(queue, event) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsymm::DoSymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the + // left) or B (on the right) in the Xgemm routine. + auto k = (side == Side::kLeft) ? m : n; + + // Checks for validity of the squared A matrix + auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as + // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix + bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared"; + + // Temporary buffer for a copy of the symmetric matrix + try { + auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); + + // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm + // routine afterwards + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the symmetric-to-squared kernel + kernel.SetArgument(0, static_cast(k)); + kernel.SetArgument(1, static_cast(a_ld)); + kernel.SetArgument(2, static_cast(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast(k)); + kernel.SetArgument(5, static_cast(k)); + kernel.SetArgument(6, static_cast(0)); + kernel.SetArgument(7, temp_symm()); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // symmetric-to-squared kernel uses the same parameters. + auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the regular Xgemm code with either "C := AB+C" or ... + if (side == Side::kLeft) { + status = DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + temp_symm, 0, k, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld); + } + + // ... with "C := BA+C". Note that A and B are now reversed. + else { + status = DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_symm, 0, k, + beta, + c_buffer, c_offset, c_ld); + + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(status) { + case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; + case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; + case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; + case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; + case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; + case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; + } + } + + // Return the status of the Xgemm routine + return status; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsymm; +template class Xsymm; +template class Xsymm; +template class Xsymm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc new file mode 100644 index 00000000..d54f2fc1 --- /dev/null +++ b/src/routines/level3/xsyr2k.cc @@ -0,0 +1,166 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr2k class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level3/xsyr2k.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xsyr2k::precision_ = Precision::kSingle; +template <> const Precision Xsyr2k::precision_ = Precision::kDouble; +template <> const Precision Xsyr2k::precision_ = Precision::kComplexSingle; +template <> const Precision Xsyr2k::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsyr2k::Xsyr2k(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. + auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo); + auto c_rotated = (layout == Layout::kRowMajor); + + // Computes the first and second dimensions of the A and B matrices taking the layout into account + auto ab_one = (ab_rotated) ? k : n; + auto ab_two = (ab_rotated) ? n : k; + + // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix B cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // Allocates space on the device for padded and/or transposed input and output matrices. + try { + auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to + // fill them up until they reach a certain multiple of size (kernel parameter dependent). + status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a, + ab_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_b, + ab_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + c_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha); + kernel.SetArgument(3, beta); + kernel.SetArgument(4, temp_a()); + kernel.SetArgument(5, temp_b()); + kernel.SetArgument(6, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Swaps the arguments for matrices A and B, and sets 'beta' to 1 + auto one = static_cast(1); + kernel.SetArgument(3, one); + kernel.SetArgument(4, temp_b()); + kernel.SetArgument(5, temp_a()); + + // Runs the kernel again + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + n, n, c_ld, c_offset, c_buffer, + c_rotated, false, false, upper, lower, false, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsyr2k; +template class Xsyr2k; +template class Xsyr2k; +template class Xsyr2k; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc new file mode 100644 index 00000000..bb952410 --- /dev/null +++ b/src/routines/level3/xsyrk.cc @@ -0,0 +1,147 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyrk class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level3/xsyrk.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xsyrk::precision_ = Precision::kSingle; +template <> const Precision Xsyrk::precision_ = Precision::kDouble; +template <> const Precision Xsyrk::precision_ = Precision::kComplexSingle; +template <> const Precision Xsyrk::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsyrk::Xsyrk(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. + auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); + auto c_rotated = (layout == Layout::kRowMajor); + + // Computes the first and second dimensions of the A matrix taking the layout into account + auto a_one = (a_rotated) ? k : n; + auto a_two = (a_rotated) ? n : k; + + // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // Allocates space on the device for padded and/or transposed input and output matrices. + try { + auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to + // fill it up until it reaches a certain multiple of size (kernel parameter dependent). + status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a, + a_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + c_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha); + kernel.SetArgument(3, beta); + kernel.SetArgument(4, temp_a()); + kernel.SetArgument(5, temp_a()); + kernel.SetArgument(6, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + n, n, c_ld, c_offset, c_buffer, + c_rotated, false, false, upper, lower, false, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsyrk; +template class Xsyrk; +template class Xsyrk; +template class Xsyrk; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc new file mode 100644 index 00000000..52f272e3 --- /dev/null +++ b/src/routines/level3/xtrmm.cc @@ -0,0 +1,135 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrmm class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level3/xtrmm.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xtrmm::Xtrmm(CommandQueue &queue, Event &event): + Xgemm(queue, event) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } + + // Computes the k dimension. This is based on whether or not matrix is A (on the left) + // or B (on the right) in the Xgemm routine. + auto k = (side == Side::kLeft) ? m : n; + + // Checks for validity of the triangular A matrix + auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as + // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix + bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared"; + + // Determines whether or not the triangular matrix is unit-diagonal + auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false; + + // Temporary buffer for a copy of the triangular matrix + try { + auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); + + // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm + // routine afterwards + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the triangular-to-squared kernel + kernel.SetArgument(0, static_cast(k)); + kernel.SetArgument(1, static_cast(a_ld)); + kernel.SetArgument(2, static_cast(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast(k)); + kernel.SetArgument(5, static_cast(k)); + kernel.SetArgument(6, static_cast(0)); + kernel.SetArgument(7, temp_triangular()); + kernel.SetArgument(8, static_cast(unit_diagonal)); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // triangular-to-squared kernel uses the same parameters. + auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the regular Xgemm code with either "B := alpha*A*B" or ... + if (side == Side::kLeft) { + status = DoGemm(layout, a_transpose, Transpose::kNo, + m, n, k, + alpha, + temp_triangular, 0, k, + b_buffer, b_offset, b_ld, + static_cast(0.0), + b_buffer, b_offset, b_ld); + } + + // ... with "B := alpha*B*A". Note that A and B are now reversed. + else { + status = DoGemm(layout, Transpose::kNo, a_transpose, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_triangular, 0, k, + static_cast(0.0), + b_buffer, b_offset, b_ld); + + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(status) { + case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; + case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; + case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; + case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; + case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; + case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; + } + } + + // Return the status of the Xgemm routine + return status; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrmm; +template class Xtrmm; +template class Xtrmm; +template class Xtrmm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/xaxpy.cc b/src/routines/xaxpy.cc deleted file mode 100644 index b68458da..00000000 --- a/src/routines/xaxpy.cc +++ /dev/null @@ -1,115 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xaxpy class (see the header for information about the class). -// -// ================================================================================================= - -#include "internal/routines/xaxpy.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xaxpy::precision_ = Precision::kSingle; -template <> const Precision Xaxpy::precision_ = Precision::kDouble; -template <> const Precision Xaxpy::precision_ = Precision::kComplexSingle; -template <> const Precision Xaxpy::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xaxpy::Xaxpy(CommandQueue &queue, Event &event): - Routine(queue, event, {"Xaxpy"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines whether or not the fast-version can be used - bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && - (y_offset == 0) && (y_inc == 1) && - IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); - - // If possible, run the fast-version of the kernel - auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy"; - - // Retrieves the Xaxpy kernel from the compiled binary - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast(x_offset)); - kernel.SetArgument(4, static_cast(x_inc)); - kernel.SetArgument(5, y_buffer()); - kernel.SetArgument(6, static_cast(y_offset)); - kernel.SetArgument(7, static_cast(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector{n_ceiled/db_["WPT"]}; - auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local); - } - if (ErrorIn(status)) { return status; } - - // Waits for all kernels to finish - queue_.Finish(); - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xaxpy; -template class Xaxpy; -template class Xaxpy; -template class Xaxpy; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xgemm.cc b/src/routines/xgemm.cc deleted file mode 100644 index c8674282..00000000 --- a/src/routines/xgemm.cc +++ /dev/null @@ -1,172 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xgemm class (see the header for information about the class). -// -// ================================================================================================= - -#include "internal/routines/xgemm.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xgemm::precision_ = Precision::kSingle; -template <> const Precision Xgemm::precision_ = Precision::kDouble; -template <> const Precision Xgemm::precision_ = Precision::kComplexSingle; -template <> const Precision Xgemm::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xgemm::Xgemm(CommandQueue &queue, Event &event): - Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xgemm::DoGemm(const Layout layout, - const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; } - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. Note - // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of - // col-major) to be transformed, so transposing requirements are not the same as whether or not - // the matrix is actually transposed in memory. - auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); - auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && b_transpose == Transpose::kNo); - auto c_rotated = (layout == Layout::kRowMajor); - auto a_do_transpose = a_rotated; - auto b_do_transpose = !b_rotated; - auto c_do_transpose = c_rotated; - - // In case of complex data-types, the transpose can also become a conjugate transpose - auto a_conjugate = (a_transpose == Transpose::kConjugate); - auto b_conjugate = (b_transpose == Transpose::kConjugate); - - // Computes the first and second dimensions of the 3 matrices taking into account whether the - // matrices are rotated or not - auto a_one = (a_rotated) ? k : m; - auto a_two = (a_rotated) ? m : k; - auto b_one = (b_rotated) ? n : k; - auto b_two = (b_rotated) ? k : n; - auto c_one = (c_rotated) ? n : m; - auto c_two = (c_rotated) ? m : n; - - // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. Also tests that the leading dimensions of: - // matrix A cannot be less than K when rotated, or less than M when not-rotated - // matrix B cannot be less than N when rotated, or less than K when not-rotated - // matrix C cannot be less than N when rotated, or less than M when not-rotated - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of m, n, and k - auto m_ceiled = Ceil(m, db_["MWG"]); - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Allocates space on the device for padded and/or transposed input and output matrices. - try { - auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T)); - auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T)); - - // Loads the program from the database - auto& program = GetProgramFromCache(); - - // Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill - // them up until they reach a certain multiple of size (kernel parameter dependent). - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, - m_ceiled, k_ceiled, m_ceiled, 0, temp_a, - a_do_transpose, a_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_b, - b_do_transpose, b_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Only necessary for matrix C if it used both as input and output - if (beta != static_cast(0)) { - status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer, - m_ceiled, n_ceiled, m_ceiled, 0, temp_c, - c_do_transpose, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - } - - // Retrieves the Xgemm kernel from the compiled binary - try { - auto kernel = Kernel(program, "Xgemm"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(m_ceiled)); - kernel.SetArgument(1, static_cast(n_ceiled)); - kernel.SetArgument(2, static_cast(k_ceiled)); - kernel.SetArgument(3, alpha); - kernel.SetArgument(4, beta); - kernel.SetArgument(5, temp_a()); - kernel.SetArgument(6, temp_b()); - kernel.SetArgument(7, temp_c()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (m_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the post-processing kernel - status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c, - c_one, c_two, c_ld, c_offset, c_buffer, - c_do_transpose, false, false, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xgemm; -template class Xgemm; -template class Xgemm; -template class Xgemm; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xgemv.cc b/src/routines/xgemv.cc deleted file mode 100644 index 1868dec4..00000000 --- a/src/routines/xgemv.cc +++ /dev/null @@ -1,146 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xgemv class (see the header for information about the class). -// -// ================================================================================================= - -#include "internal/routines/xgemv.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xgemv::precision_ = Precision::kSingle; -template <> const Precision Xgemv::precision_ = Precision::kDouble; -template <> const Precision Xgemv::precision_ = Precision::kComplexSingle; -template <> const Precision Xgemv::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xgemv::Xgemv(CommandQueue &queue, Event &event): - Routine(queue, event, {"Xgemv"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xgemv::DoGemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // Makes sure all dimensions are larger than zero - if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } - - // Computes whether or not the matrix has an alternative layout (row or column-major). - auto a_altlayout = (layout == Layout::kRowMajor); - auto a_one = (a_altlayout) ? n : m; - auto a_two = (a_altlayout) ? m : n; - - // Swap m and n if the matrix is transposed - auto a_transposed = (a_transpose != Transpose::kNo); - auto m_real = (a_transposed) ? n : m; - auto n_real = (a_transposed) ? m : n; - - // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator) - auto a_rotated = a_transposed ^ a_altlayout; - - // In case of complex data-types, the transpose can also become a conjugate transpose - auto a_conjugate = (a_transpose == Transpose::kConjugate); - - // Tests the matrix and the vectors for validity - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines whether or not the fast-version can be used - bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && - IsMultiple(m, db_["WGS2"]*db_["WPT2"]) && - IsMultiple(n, db_["WGS2"]) && - IsMultiple(a_ld, db_["VW2"]); - bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) && - IsMultiple(m, db_["WGS3"]*db_["WPT3"]) && - IsMultiple(n, db_["WGS3"]) && - IsMultiple(a_ld, db_["VW3"]); - - // If possible, run the fast-version (rotated or non-rotated) of the kernel - auto kernel_name = "Xgemv"; - auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]); - auto global_size = m_ceiled / db_["WPT1"]; - auto local_size = db_["WGS1"]; - if (use_fast_kernel) { - kernel_name = "XgemvFast"; - global_size = m_real / db_["WPT2"]; - local_size = db_["WGS2"]; - } - if (use_fast_kernel_rot) { - kernel_name = "XgemvFastRot"; - global_size = m_real / db_["WPT3"]; - local_size = db_["WGS3"]; - } - - // Retrieves the Xgemv kernel from the compiled binary - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(m_real)); - kernel.SetArgument(1, static_cast(n_real)); - kernel.SetArgument(2, alpha); - kernel.SetArgument(3, beta); - kernel.SetArgument(4, static_cast(a_rotated)); - kernel.SetArgument(5, a_buffer()); - kernel.SetArgument(6, static_cast(a_offset)); - kernel.SetArgument(7, static_cast(a_ld)); - kernel.SetArgument(8, x_buffer()); - kernel.SetArgument(9, static_cast(x_offset)); - kernel.SetArgument(10, static_cast(x_inc)); - kernel.SetArgument(11, y_buffer()); - kernel.SetArgument(12, static_cast(y_offset)); - kernel.SetArgument(13, static_cast(y_inc)); - kernel.SetArgument(14, static_cast(a_conjugate)); - - // Launches the kernel - auto global = std::vector{global_size}; - auto local = std::vector{local_size}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Waits for all kernels to finish - queue_.Finish(); - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xgemv; -template class Xgemv; -template class Xgemv; -template class Xgemv; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xhemm.cc b/src/routines/xhemm.cc deleted file mode 100644 index 73f769ed..00000000 --- a/src/routines/xhemm.cc +++ /dev/null @@ -1,130 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xhemm class (see the header for information about the class). -// -// ================================================================================================= - -#include "internal/routines/xhemm.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xhemm::Xhemm(CommandQueue &queue, Event &event): - Xgemm(queue, event) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xhemm::DoHemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } - - // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the - // left) or B (on the right) in the Xgemm routine. - auto k = (side == Side::kLeft) ? m : n; - - // Checks for validity of the squared A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared"; - - // Temporary buffer for a copy of the hermitian matrix - try { - auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); - - // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm - // routine afterwards - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the hermitian-to-squared kernel - kernel.SetArgument(0, static_cast(k)); - kernel.SetArgument(1, static_cast(a_ld)); - kernel.SetArgument(2, static_cast(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast(k)); - kernel.SetArgument(5, static_cast(k)); - kernel.SetArgument(6, static_cast(0)); - kernel.SetArgument(7, temp_herm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // hermitian-to-squared kernel uses the same parameters. - auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the regular Xgemm code with either "C := AB+C" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - temp_herm, 0, k, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld); - } - - // ... with "C := BA+C". Note that A and B are now reversed. - else { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_herm, 0, k, - beta, - c_buffer, c_offset, c_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } - } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xhemm; -template class Xhemm; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xher2k.cc b/src/routines/xher2k.cc deleted file mode 100644 index b19b743b..00000000 --- a/src/routines/xher2k.cc +++ /dev/null @@ -1,178 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xher2k class (see the header for information about the class). -// -// ================================================================================================= - -#include "internal/routines/xher2k.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xher2k::precision_ = Precision::kComplexSingle; -template <> const Precision Xher2k::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xher2k::Xher2k(CommandQueue &queue, Event &event): - Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } - - // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or - // to matrix A (argument: conjugate transpose) - auto ab_conjugate = (ab_transpose != Transpose::kNo); - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. - auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) || - (layout == Layout::kRowMajor && !ab_conjugate); - auto c_rotated = (layout == Layout::kRowMajor); - - // Computes the first and second dimensions of the A and B matrices taking the layout into account - auto ab_one = (ab_rotated) ? k : n; - auto ab_two = (ab_rotated) ? n : k; - - // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. Also tests that the leading dimensions of: - // matrix A cannot be less than N when rotated, or less than K when not-rotated - // matrix B cannot be less than N when rotated, or less than K when not-rotated - // matrix C cannot be less than N - auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Decides which kernel to run: the upper-triangular or lower-triangular version - auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - - // Allocates space on the device for padded and/or transposed input and output matrices. - try { - auto temp_a1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_b1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_a2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_b2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); - - // Loads the program from the database - auto& program = GetProgramFromCache(); - - // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to - // fill them up until they reach a certain multiple of size (kernel parameter dependent). - status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_a1, - ab_rotated, ab_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_a2, - ab_rotated, !ab_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_b1, - ab_rotated, ab_conjugate, true, false, false, false, program); - status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_b2, - ab_rotated, !ab_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - auto complex_beta = T{beta, static_cast(0.0)}; - kernel.SetArgument(0, static_cast(n_ceiled)); - kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha); - kernel.SetArgument(3, complex_beta); - kernel.SetArgument(4, temp_a1()); - kernel.SetArgument(5, temp_b2()); - kernel.SetArgument(6, temp_c()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha - auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; - auto complex_one = T{static_cast(1.0), static_cast(0.0)}; - kernel.SetArgument(2, conjugate_alpha); - kernel.SetArgument(3, complex_one); - kernel.SetArgument(4, temp_b1()); - kernel.SetArgument(5, temp_a2()); - - // Runs the kernel again - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, true, program); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xher2k; -template class Xher2k; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xherk.cc b/src/routines/xherk.cc deleted file mode 100644 index 6bc9cd6c..00000000 --- a/src/routines/xherk.cc +++ /dev/null @@ -1,156 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xherk class (see the header for information about the class). -// -// ================================================================================================= - -#include "internal/routines/xherk.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xherk::precision_ = Precision::kComplexSingle; -template <> const Precision Xherk::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xherk::Xherk(CommandQueue &queue, Event &event): - Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const U alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } - - // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or - // to matrix A (argument: conjugate transpose) - auto a_conjugate = (a_transpose != Transpose::kNo); - auto b_conjugate = (a_transpose == Transpose::kNo); - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. - auto a_rotated = (layout == Layout::kColMajor && a_conjugate) || - (layout == Layout::kRowMajor && !a_conjugate); - auto c_rotated = (layout == Layout::kRowMajor); - - // Computes the first and second dimensions of the A matrix taking the layout into account - auto a_one = (a_rotated) ? k : n; - auto a_two = (a_rotated) ? n : k; - - // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. Also tests that the leading dimensions of: - // matrix A cannot be less than N when rotated, or less than K when not-rotated - // matrix C cannot be less than N - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Decides which kernel to run: the upper-triangular or lower-triangular version - auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - - // Allocates space on the device for padded and/or transposed input and output matrices. - try { - auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); - - // Loads the program from the database - auto& program = GetProgramFromCache(); - - // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to - // fill it up until it reaches a certain multiple of size (kernel parameter dependent). It - // creates two copies: - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_a, - a_rotated, a_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_b, - a_rotated, b_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - auto complex_alpha = T{alpha, static_cast(0.0)}; - auto complex_beta = T{beta, static_cast(0.0)}; - kernel.SetArgument(0, static_cast(n_ceiled)); - kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, complex_alpha); - kernel.SetArgument(3, complex_beta); - kernel.SetArgument(4, temp_a()); - kernel.SetArgument(5, temp_b()); - kernel.SetArgument(6, temp_c()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, true, program); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xherk; -template class Xherk; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xsymm.cc b/src/routines/xsymm.cc deleted file mode 100644 index b39eb24d..00000000 --- a/src/routines/xsymm.cc +++ /dev/null @@ -1,132 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsymm class (see the header for information about the class). -// -// ================================================================================================= - -#include "internal/routines/xsymm.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xsymm::Xsymm(CommandQueue &queue, Event &event): - Xgemm(queue, event) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xsymm::DoSymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } - - // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the - // left) or B (on the right) in the Xgemm routine. - auto k = (side == Side::kLeft) ? m : n; - - // Checks for validity of the squared A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared"; - - // Temporary buffer for a copy of the symmetric matrix - try { - auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); - - // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm - // routine afterwards - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the symmetric-to-squared kernel - kernel.SetArgument(0, static_cast(k)); - kernel.SetArgument(1, static_cast(a_ld)); - kernel.SetArgument(2, static_cast(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast(k)); - kernel.SetArgument(5, static_cast(k)); - kernel.SetArgument(6, static_cast(0)); - kernel.SetArgument(7, temp_symm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // symmetric-to-squared kernel uses the same parameters. - auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the regular Xgemm code with either "C := AB+C" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - temp_symm, 0, k, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld); - } - - // ... with "C := BA+C". Note that A and B are now reversed. - else { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_symm, 0, k, - beta, - c_buffer, c_offset, c_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } - } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xsymm; -template class Xsymm; -template class Xsymm; -template class Xsymm; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xsyr2k.cc b/src/routines/xsyr2k.cc deleted file mode 100644 index abb8b7eb..00000000 --- a/src/routines/xsyr2k.cc +++ /dev/null @@ -1,166 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsyr2k class (see the header for information about the class). -// -// ================================================================================================= - -#include "internal/routines/xsyr2k.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xsyr2k::precision_ = Precision::kSingle; -template <> const Precision Xsyr2k::precision_ = Precision::kDouble; -template <> const Precision Xsyr2k::precision_ = Precision::kComplexSingle; -template <> const Precision Xsyr2k::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xsyr2k::Xsyr2k(CommandQueue &queue, Event &event): - Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. - auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo); - auto c_rotated = (layout == Layout::kRowMajor); - - // Computes the first and second dimensions of the A and B matrices taking the layout into account - auto ab_one = (ab_rotated) ? k : n; - auto ab_two = (ab_rotated) ? n : k; - - // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. Also tests that the leading dimensions of: - // matrix A cannot be less than N when rotated, or less than K when not-rotated - // matrix B cannot be less than N when rotated, or less than K when not-rotated - // matrix C cannot be less than N - auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Decides which kernel to run: the upper-triangular or lower-triangular version - auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - - // Allocates space on the device for padded and/or transposed input and output matrices. - try { - auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); - - // Loads the program from the database - auto& program = GetProgramFromCache(); - - // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to - // fill them up until they reach a certain multiple of size (kernel parameter dependent). - status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_a, - ab_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_b, - ab_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(n_ceiled)); - kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha); - kernel.SetArgument(3, beta); - kernel.SetArgument(4, temp_a()); - kernel.SetArgument(5, temp_b()); - kernel.SetArgument(6, temp_c()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Swaps the arguments for matrices A and B, and sets 'beta' to 1 - auto one = static_cast(1); - kernel.SetArgument(3, one); - kernel.SetArgument(4, temp_b()); - kernel.SetArgument(5, temp_a()); - - // Runs the kernel again - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, false, program); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xsyr2k; -template class Xsyr2k; -template class Xsyr2k; -template class Xsyr2k; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xsyrk.cc b/src/routines/xsyrk.cc deleted file mode 100644 index 3efa0598..00000000 --- a/src/routines/xsyrk.cc +++ /dev/null @@ -1,147 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsyrk class (see the header for information about the class). -// -// ================================================================================================= - -#include "internal/routines/xsyrk.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xsyrk::precision_ = Precision::kSingle; -template <> const Precision Xsyrk::precision_ = Precision::kDouble; -template <> const Precision Xsyrk::precision_ = Precision::kComplexSingle; -template <> const Precision Xsyrk::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xsyrk::Xsyrk(CommandQueue &queue, Event &event): - Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. - auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); - auto c_rotated = (layout == Layout::kRowMajor); - - // Computes the first and second dimensions of the A matrix taking the layout into account - auto a_one = (a_rotated) ? k : n; - auto a_two = (a_rotated) ? n : k; - - // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. Also tests that the leading dimensions of: - // matrix A cannot be less than N when rotated, or less than K when not-rotated - // matrix C cannot be less than N - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Decides which kernel to run: the upper-triangular or lower-triangular version - auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - - // Allocates space on the device for padded and/or transposed input and output matrices. - try { - auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); - - // Loads the program from the database - auto& program = GetProgramFromCache(); - - // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to - // fill it up until it reaches a certain multiple of size (kernel parameter dependent). - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_a, - a_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(n_ceiled)); - kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha); - kernel.SetArgument(3, beta); - kernel.SetArgument(4, temp_a()); - kernel.SetArgument(5, temp_a()); - kernel.SetArgument(6, temp_c()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, false, program); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xsyrk; -template class Xsyrk; -template class Xsyrk; -template class Xsyrk; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xtrmm.cc b/src/routines/xtrmm.cc deleted file mode 100644 index 543df844..00000000 --- a/src/routines/xtrmm.cc +++ /dev/null @@ -1,135 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xtrmm class (see the header for information about the class). -// -// ================================================================================================= - -#include "internal/routines/xtrmm.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xtrmm::Xtrmm(CommandQueue &queue, Event &event): - Xgemm(queue, event) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { - - // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } - - // Computes the k dimension. This is based on whether or not matrix is A (on the left) - // or B (on the right) in the Xgemm routine. - auto k = (side == Side::kLeft) ? m : n; - - // Checks for validity of the triangular A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared"; - - // Determines whether or not the triangular matrix is unit-diagonal - auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false; - - // Temporary buffer for a copy of the triangular matrix - try { - auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); - - // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm - // routine afterwards - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the triangular-to-squared kernel - kernel.SetArgument(0, static_cast(k)); - kernel.SetArgument(1, static_cast(a_ld)); - kernel.SetArgument(2, static_cast(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast(k)); - kernel.SetArgument(5, static_cast(k)); - kernel.SetArgument(6, static_cast(0)); - kernel.SetArgument(7, temp_triangular()); - kernel.SetArgument(8, static_cast(unit_diagonal)); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // triangular-to-squared kernel uses the same parameters. - auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the regular Xgemm code with either "B := alpha*A*B" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, a_transpose, Transpose::kNo, - m, n, k, - alpha, - temp_triangular, 0, k, - b_buffer, b_offset, b_ld, - static_cast(0.0), - b_buffer, b_offset, b_ld); - } - - // ... with "B := alpha*B*A". Note that A and B are now reversed. - else { - status = DoGemm(layout, Transpose::kNo, a_transpose, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_triangular, 0, k, - static_cast(0.0), - b_buffer, b_offset, b_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } - } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xtrmm; -template class Xtrmm; -template class Xtrmm; -template class Xtrmm; - -// ================================================================================================= -} // namespace clblast -- cgit v1.2.3