diff options
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | include/internal/routines/xsyrk.h | 49 | ||||
-rw-r--r-- | src/clblast.cc | 4 | ||||
-rw-r--r-- | src/routines/xsyrk.cc | 147 | ||||
-rw-r--r-- | test/correctness/routines/xsyrk.cc | 96 | ||||
-rw-r--r-- | test/performance/graphs/xsyrk.r | 94 | ||||
-rw-r--r-- | test/performance/routines/xsyrk.cc | 113 |
7 files changed, 501 insertions, 4 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index b84ed62b..a8e756e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,7 +99,7 @@ set(ROUTINES_XY xaxpy) set(ROUTINES_AXY xgemv) set(ROUTINES_ABC xgemm xsymm) set(ROUTINES_AB ) -set(ROUTINES_AC ) +set(ROUTINES_AC xsyrk) set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC} ${ROUTINES_AB} ${ROUTINES_AC}) # ================================================================================================== diff --git a/include/internal/routines/xsyrk.h b/include/internal/routines/xsyrk.h new file mode 100644 index 00000000..3dab731f --- /dev/null +++ b/include/internal/routines/xsyrk.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xsyrk routine. The precision is implemented using a template argument. +// The implementation is based on the regular Xgemm routine and kernel, but with two main changes: +// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part. +// 2) The main Xgemm kernel masks workgroups not contributing to usefull data. This is only for +// performance reasons, as the actual masking is done later (see the first point). +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYRK_H_ +#define CLBLAST_ROUTINES_XSYRK_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xsyrk: public Routine { + public: + Xsyrk(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSYRK_H_ +#endif diff --git a/src/clblast.cc b/src/clblast.cc index e0d085a9..13dfb50f 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -26,6 +26,7 @@ // BLAS level-3 includes #include "internal/routines/xgemm.h" #include "internal/routines/xsymm.h" +#include "internal/routines/xsyrk.h" namespace clblast { // ================================================================================================= @@ -254,7 +255,6 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose tr cl_command_queue* queue, cl_event* event) { auto queue_cpp = CommandQueue(*queue); auto event_cpp = Event(*event); - /* auto routine = Xsyrk<T>(queue_cpp, event_cpp); // Loads the kernel source-code as an include (C++11 raw string literal) @@ -276,8 +276,6 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose tr return routine.DoSyrk(layout, triangle, transpose_a, n, k, alpha, Buffer(a_buffer), a_offset, a_ld, beta, Buffer(c_buffer), c_offset, c_ld); - */ - return StatusCode::kSuccess; } template StatusCode Syrk<float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, diff --git a/src/routines/xsyrk.cc b/src/routines/xsyrk.cc new file mode 100644 index 00000000..1f645fd5 --- /dev/null +++ b/src/routines/xsyrk.cc @@ -0,0 +1,147 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xsyrk class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/xsyrk.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle; +template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble; +template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle; +template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. + auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); + auto c_rotated = (layout == Layout::kRowMajor); + + // In case of complex data-types, the transpose can also become a conjugate transpose + auto a_conjugate = (a_transpose == Transpose::kConjugate); + + // Computes the first and second dimensions of the A matrix taking the layout into account + auto a_one = (a_rotated) ? k : n; + auto a_two = (a_rotated) ? n : k; + + // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // Allocates space on the device for padded and/or transposed input and output matrices. + try { + auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernel. This transposes the matrices A and B, but also pads zeros to + // fill them up until they reach a certain multiple of size (kernel parameter dependent). + status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a, + a_rotated, a_conjugate, true, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + c_rotated, false, true, false, false, program); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n_ceiled)); + kernel.SetArgument(1, static_cast<int>(k_ceiled)); + kernel.SetArgument(2, alpha); + kernel.SetArgument(3, beta); + kernel.SetArgument(4, temp_a()); + kernel.SetArgument(5, temp_a()); + kernel.SetArgument(6, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector<size_t>{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + n, n, c_ld, c_offset, c_buffer, + c_rotated, false, false, upper, lower, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsyrk<float>; +template class Xsyrk<double>; +template class Xsyrk<float2>; +template class Xsyrk<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/test/correctness/routines/xsyrk.cc b/test/correctness/routines/xsyrk.cc new file mode 100644 index 00000000..8d3bd82e --- /dev/null +++ b/test/correctness/routines/xsyrk.cc @@ -0,0 +1,96 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under the MIT license. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the tests for the Xsyrk routine. It is based on the TestAC class. +// +// ================================================================================================= + +#include "wrapper_clblas.h" +#include "correctness/testac.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison. +template <typename T> +void XsyrkTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates the CLBlast lambda + auto clblast_lambda = [](const Arguments<T> &args, + const Buffer &a_mat, const Buffer &c_mat, + CommandQueue &queue) -> StatusCode { + auto queue_plain = queue(); + auto event = cl_event{}; + return Syrk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + }; + + // Creates the clBLAS lambda (for comparison) + auto clblas_lambda = [](const Arguments<T> &args, + const Buffer &a_mat, const Buffer &c_mat, + CommandQueue &queue) -> StatusCode { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + static_cast<clblasTranspose>(args.a_transpose), + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + return static_cast<StatusCode>(status); + }; + + // Initializes the arguments relevant for this routine + auto args = Arguments<T>{}; + const auto options = std::vector<std::string>{kArgN, kArgK, kArgLayout, + kArgTriangle, kArgATransp, + kArgALeadDim, kArgCLeadDim, + kArgAOffset, kArgCOffset}; + + // Creates a tester + TestAC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { + args.layout = layout; + for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) { + args.triangle = triangle; + for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it is + args.a_transpose = a_transpose; // not supported by clBLAS + const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose); + + // Runs the tests + tester.TestRegular(args, case_name); + tester.TestInvalidBufferSizes(args, case_name); + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::XsyrkTest<float>(argc, argv, false, "SSYRK"); + clblast::XsyrkTest<double>(argc, argv, true, "DSYRK"); + clblast::XsyrkTest<clblast::float2>(argc, argv, true, "CSYRK"); + clblast::XsyrkTest<clblast::double2>(argc, argv, true, "ZSYRK"); + return 0; +} + +// ================================================================================================= diff --git a/test/performance/graphs/xsyrk.r b/test/performance/graphs/xsyrk.r new file mode 100644 index 00000000..fe8598e9 --- /dev/null +++ b/test/performance/graphs/xsyrk.r @@ -0,0 +1,94 @@ + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +# project uses a tab-size of two spaces and a max-width of 100 characters per line. +# +# Author(s): +# Cedric Nugteren <www.cedricnugteren.nl> +# +# This file implements the performance script for the Xsyrk routine +# +# ================================================================================================== + +# Includes the common functions +args <- commandArgs(trailingOnly = FALSE) +thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) +source(file.path(dirname(thisfile), "common.r")) + +# ================================================================================================== + +# Settings +routine_name <- "xsyrk" +parameters <- c("-n","-k","-layout","-triangle","-transA", + "-num_steps","-step","-runs","-precision") +precision <- 32 + +# Sets the names of the test-cases +test_names <- list( + "multiples of 128", + "multiples of 128 (+1)", + "around n=k=512", + "around n=k=2048", + "layouts and transposing (n=k=1024)", + "powers of 2" +) + +# Defines the test-cases +test_values <- list( + list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)), + list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)), + list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)), + list(c(2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)), + list( + c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision), + c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision), + c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision), + c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision), + c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision) + ), + list( + c(8, 8, 0, 0, 0, 1, 0, num_runs, precision), + c(16, 16, 0, 0, 0, 1, 0, num_runs, precision), + c(32, 32, 0, 0, 0, 1, 0, num_runs, precision), + c(64, 64, 0, 0, 0, 1, 0, num_runs, precision), + c(128, 128, 0, 0, 0, 1, 0, num_runs, precision), + c(256, 256, 0, 0, 0, 1, 0, num_runs, precision), + c(512, 512, 0, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), + c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision), + c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision), + c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision) + ) +) + +# Defines the x-labels corresponding to the test-cases +test_xlabels <- list( + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "layout (row/col), triangle (u/l), transA (n/y)", + "matrix sizes (n=k)" +) + +# Defines the x-axis of the test-cases +test_xaxis <- list( + c("n", ""), + c("n", ""), + c("n", ""), + c("n", ""), + list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y", + "col,u,n", "col,u,y", "col,l,n", "col,l,y")), + c("n", "x") +) + +# ================================================================================================== + +# Start the script +main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, + test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE) + +# ==================================================================================================
\ No newline at end of file diff --git a/test/performance/routines/xsyrk.cc b/test/performance/routines/xsyrk.cc new file mode 100644 index 00000000..f36d665a --- /dev/null +++ b/test/performance/routines/xsyrk.cc @@ -0,0 +1,113 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xsyrk command-line interface tester. +// +// ================================================================================================= + +#include <string> +#include <vector> +#include <exception> + +#include "wrapper_clblas.h" +#include "performance/client.h" + +namespace clblast { +// ================================================================================================= + +// The client, used for performance testing. It contains the function calls to CLBlast and to other +// libraries to compare against. +template <typename T> +void PerformanceXsyrk(const Arguments<T> &args, + const Buffer &a_mat, const Buffer &c_mat, + CommandQueue &queue) { + + // Creates the CLBlast lambda + auto clblast_lambda = [&args, &a_mat, &c_mat, &queue]() { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syrk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + if (status != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status))); + } + }; + + // Creates the clBLAS lambda (for comparison) + auto clblas_lambda = [&args, &a_mat, &c_mat, &queue]() { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + static_cast<clblasTranspose>(args.a_transpose), + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + if (status != CL_SUCCESS) { + throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status))); + } + }; + + // Runs the routines and collect the timings + auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda); + auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda); + + // Prints the performance of both libraries + const auto flops = args.n * args.n * args.k; + const auto bytes = (args.n*args.k + args.n*args.n) * sizeof(T); + const auto output_ints = std::vector<size_t>{args.n, args.k, + static_cast<size_t>(args.layout), + static_cast<size_t>(args.triangle), + static_cast<size_t>(args.a_transpose), + args.a_ld, args.c_ld, + args.a_offset, args.c_offset}; + const auto output_strings = std::vector<std::string>{ToString(args.alpha), + ToString(args.beta)}; + PrintTableRow(output_ints, output_strings, args.no_abbrv, + ms_clblast, ms_clblas, flops, bytes); +} + +// ================================================================================================= + +// Main function which calls the common client code with the routine-specific function as argument. +void ClientXsyrk(int argc, char *argv[]) { + const auto o = std::vector<std::string>{kArgN, kArgK, + kArgLayout, kArgTriangle, kArgATransp, + kArgALeadDim, kArgCLeadDim, + kArgAOffset, kArgCOffset, + kArgAlpha, kArgBeta}; + switch(GetPrecision(argc, argv)) { + case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case Precision::kSingle: ClientAC<float>(argc, argv, PerformanceXsyrk<float>, o); break; + case Precision::kDouble: ClientAC<double>(argc, argv, PerformanceXsyrk<double>, o); break; + case Precision::kComplexSingle: ClientAC<float2>(argc, argv, PerformanceXsyrk<float2>, o); break; + case Precision::kComplexDouble: ClientAC<double2>(argc, argv, PerformanceXsyrk<double2>, o); break; + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::ClientXsyrk(argc, argv); + return 0; +} + +// ================================================================================================= |