From aa852bbe67a7dc9018afd7d1349184f0284d215c Mon Sep 17 00:00:00 2001 From: CNugteren Date: Sun, 12 Jul 2015 16:57:09 +0200 Subject: Added subfolders for the level1/2/3 routines --- CMakeLists.txt | 47 ++++++-- include/internal/routines/level1/xaxpy.h | 42 +++++++ include/internal/routines/level2/xgemv.h | 46 ++++++++ include/internal/routines/level3/xgemm.h | 46 ++++++++ include/internal/routines/level3/xhemm.h | 58 ++++++++++ include/internal/routines/level3/xher2k.h | 48 ++++++++ include/internal/routines/level3/xherk.h | 47 ++++++++ include/internal/routines/level3/xsymm.h | 60 ++++++++++ include/internal/routines/level3/xsyr2k.h | 48 ++++++++ include/internal/routines/level3/xsyrk.h | 49 ++++++++ include/internal/routines/level3/xtrmm.h | 58 ++++++++++ include/internal/routines/xaxpy.h | 42 ------- include/internal/routines/xgemm.h | 46 -------- include/internal/routines/xgemv.h | 46 -------- include/internal/routines/xhemm.h | 58 ---------- include/internal/routines/xher2k.h | 48 -------- include/internal/routines/xherk.h | 47 -------- include/internal/routines/xsymm.h | 60 ---------- include/internal/routines/xsyr2k.h | 48 -------- include/internal/routines/xsyrk.h | 49 -------- include/internal/routines/xtrmm.h | 58 ---------- src/clblast.cc | 20 ++-- src/routines/level1/xaxpy.cc | 115 +++++++++++++++++++ src/routines/level2/xgemv.cc | 146 +++++++++++++++++++++++ src/routines/level3/xgemm.cc | 172 ++++++++++++++++++++++++++++ src/routines/level3/xhemm.cc | 130 +++++++++++++++++++++ src/routines/level3/xher2k.cc | 178 +++++++++++++++++++++++++++++ src/routines/level3/xherk.cc | 156 +++++++++++++++++++++++++ src/routines/level3/xsymm.cc | 132 +++++++++++++++++++++ src/routines/level3/xsyr2k.cc | 166 +++++++++++++++++++++++++++ src/routines/level3/xsyrk.cc | 147 ++++++++++++++++++++++++ src/routines/level3/xtrmm.cc | 135 ++++++++++++++++++++++ src/routines/xaxpy.cc | 115 ------------------- src/routines/xgemm.cc | 172 ---------------------------- 
src/routines/xgemv.cc | 146 ----------------------- src/routines/xhemm.cc | 130 --------------------- src/routines/xher2k.cc | 178 ----------------------------- src/routines/xherk.cc | 156 ------------------------- src/routines/xsymm.cc | 132 --------------------- src/routines/xsyr2k.cc | 166 --------------------------- src/routines/xsyrk.cc | 147 ------------------------ src/routines/xtrmm.cc | 135 ---------------------- test/correctness/routines/level1/xaxpy.cc | 81 +++++++++++++ test/correctness/routines/level2/xgemv.cc | 99 ++++++++++++++++ test/correctness/routines/level3/xgemm.cc | 102 +++++++++++++++++ test/correctness/routines/level3/xhemm.cc | 98 ++++++++++++++++ test/correctness/routines/level3/xher2k.cc | 100 ++++++++++++++++ test/correctness/routines/level3/xherk.cc | 92 +++++++++++++++ test/correctness/routines/level3/xsymm.cc | 100 ++++++++++++++++ test/correctness/routines/level3/xsyr2k.cc | 102 +++++++++++++++++ test/correctness/routines/level3/xsyrk.cc | 94 +++++++++++++++ test/correctness/routines/level3/xtrmm.cc | 96 ++++++++++++++++ test/correctness/routines/xaxpy.cc | 81 ------------- test/correctness/routines/xgemm.cc | 102 ----------------- test/correctness/routines/xgemv.cc | 99 ---------------- test/correctness/routines/xhemm.cc | 98 ---------------- test/correctness/routines/xher2k.cc | 100 ---------------- test/correctness/routines/xherk.cc | 92 --------------- test/correctness/routines/xsymm.cc | 100 ---------------- test/correctness/routines/xsyr2k.cc | 102 ----------------- test/correctness/routines/xsyrk.cc | 94 --------------- test/correctness/routines/xtrmm.cc | 96 ---------------- test/performance/routines/level1/xaxpy.cc | 40 +++++++ test/performance/routines/level2/xgemv.cc | 40 +++++++ test/performance/routines/level3/xgemm.cc | 40 +++++++ test/performance/routines/level3/xhemm.cc | 40 +++++++ test/performance/routines/level3/xher2k.cc | 40 +++++++ test/performance/routines/level3/xherk.cc | 40 +++++++ 
test/performance/routines/level3/xsymm.cc | 40 +++++++ test/performance/routines/level3/xsyr2k.cc | 40 +++++++ test/performance/routines/level3/xsyrk.cc | 40 +++++++ test/performance/routines/level3/xtrmm.cc | 40 +++++++ test/performance/routines/xaxpy.cc | 40 ------- test/performance/routines/xgemm.cc | 40 ------- test/performance/routines/xgemv.cc | 40 ------- test/performance/routines/xhemm.cc | 40 ------- test/performance/routines/xher2k.cc | 40 ------- test/performance/routines/xherk.cc | 40 ------- test/performance/routines/xsymm.cc | 40 ------- test/performance/routines/xsyr2k.cc | 40 ------- test/performance/routines/xsyrk.cc | 40 ------- test/performance/routines/xtrmm.cc | 40 ------- test/routines/level1/xaxpy.h | 113 ++++++++++++++++++ test/routines/level2/xgemv.h | 132 +++++++++++++++++++++ test/routines/level3/xgemm.h | 134 ++++++++++++++++++++++ test/routines/level3/xhemm.h | 134 ++++++++++++++++++++++ test/routines/level3/xher2k.h | 132 +++++++++++++++++++++ test/routines/level3/xherk.h | 121 ++++++++++++++++++++ test/routines/level3/xsymm.h | 134 ++++++++++++++++++++++ test/routines/level3/xsyr2k.h | 130 +++++++++++++++++++++ test/routines/level3/xsyrk.h | 121 ++++++++++++++++++++ test/routines/level3/xtrmm.h | 127 ++++++++++++++++++++ test/routines/xaxpy.h | 113 ------------------ test/routines/xgemm.h | 134 ---------------------- test/routines/xgemv.h | 132 --------------------- test/routines/xhemm.h | 134 ---------------------- test/routines/xher2k.h | 132 --------------------- test/routines/xherk.h | 121 -------------------- test/routines/xsymm.h | 134 ---------------------- test/routines/xsyr2k.h | 130 --------------------- test/routines/xsyrk.h | 121 -------------------- test/routines/xtrmm.h | 127 -------------------- 102 files changed, 4667 insertions(+), 4642 deletions(-) create mode 100644 include/internal/routines/level1/xaxpy.h create mode 100644 include/internal/routines/level2/xgemv.h create mode 100644 
include/internal/routines/level3/xgemm.h create mode 100644 include/internal/routines/level3/xhemm.h create mode 100644 include/internal/routines/level3/xher2k.h create mode 100644 include/internal/routines/level3/xherk.h create mode 100644 include/internal/routines/level3/xsymm.h create mode 100644 include/internal/routines/level3/xsyr2k.h create mode 100644 include/internal/routines/level3/xsyrk.h create mode 100644 include/internal/routines/level3/xtrmm.h delete mode 100644 include/internal/routines/xaxpy.h delete mode 100644 include/internal/routines/xgemm.h delete mode 100644 include/internal/routines/xgemv.h delete mode 100644 include/internal/routines/xhemm.h delete mode 100644 include/internal/routines/xher2k.h delete mode 100644 include/internal/routines/xherk.h delete mode 100644 include/internal/routines/xsymm.h delete mode 100644 include/internal/routines/xsyr2k.h delete mode 100644 include/internal/routines/xsyrk.h delete mode 100644 include/internal/routines/xtrmm.h create mode 100644 src/routines/level1/xaxpy.cc create mode 100644 src/routines/level2/xgemv.cc create mode 100644 src/routines/level3/xgemm.cc create mode 100644 src/routines/level3/xhemm.cc create mode 100644 src/routines/level3/xher2k.cc create mode 100644 src/routines/level3/xherk.cc create mode 100644 src/routines/level3/xsymm.cc create mode 100644 src/routines/level3/xsyr2k.cc create mode 100644 src/routines/level3/xsyrk.cc create mode 100644 src/routines/level3/xtrmm.cc delete mode 100644 src/routines/xaxpy.cc delete mode 100644 src/routines/xgemm.cc delete mode 100644 src/routines/xgemv.cc delete mode 100644 src/routines/xhemm.cc delete mode 100644 src/routines/xher2k.cc delete mode 100644 src/routines/xherk.cc delete mode 100644 src/routines/xsymm.cc delete mode 100644 src/routines/xsyr2k.cc delete mode 100644 src/routines/xsyrk.cc delete mode 100644 src/routines/xtrmm.cc create mode 100644 test/correctness/routines/level1/xaxpy.cc create mode 100644 
test/correctness/routines/level2/xgemv.cc create mode 100644 test/correctness/routines/level3/xgemm.cc create mode 100644 test/correctness/routines/level3/xhemm.cc create mode 100644 test/correctness/routines/level3/xher2k.cc create mode 100644 test/correctness/routines/level3/xherk.cc create mode 100644 test/correctness/routines/level3/xsymm.cc create mode 100644 test/correctness/routines/level3/xsyr2k.cc create mode 100644 test/correctness/routines/level3/xsyrk.cc create mode 100644 test/correctness/routines/level3/xtrmm.cc delete mode 100644 test/correctness/routines/xaxpy.cc delete mode 100644 test/correctness/routines/xgemm.cc delete mode 100644 test/correctness/routines/xgemv.cc delete mode 100644 test/correctness/routines/xhemm.cc delete mode 100644 test/correctness/routines/xher2k.cc delete mode 100644 test/correctness/routines/xherk.cc delete mode 100644 test/correctness/routines/xsymm.cc delete mode 100644 test/correctness/routines/xsyr2k.cc delete mode 100644 test/correctness/routines/xsyrk.cc delete mode 100644 test/correctness/routines/xtrmm.cc create mode 100644 test/performance/routines/level1/xaxpy.cc create mode 100644 test/performance/routines/level2/xgemv.cc create mode 100644 test/performance/routines/level3/xgemm.cc create mode 100644 test/performance/routines/level3/xhemm.cc create mode 100644 test/performance/routines/level3/xher2k.cc create mode 100644 test/performance/routines/level3/xherk.cc create mode 100644 test/performance/routines/level3/xsymm.cc create mode 100644 test/performance/routines/level3/xsyr2k.cc create mode 100644 test/performance/routines/level3/xsyrk.cc create mode 100644 test/performance/routines/level3/xtrmm.cc delete mode 100644 test/performance/routines/xaxpy.cc delete mode 100644 test/performance/routines/xgemm.cc delete mode 100644 test/performance/routines/xgemv.cc delete mode 100644 test/performance/routines/xhemm.cc delete mode 100644 test/performance/routines/xher2k.cc delete mode 100644 
test/performance/routines/xherk.cc delete mode 100644 test/performance/routines/xsymm.cc delete mode 100644 test/performance/routines/xsyr2k.cc delete mode 100644 test/performance/routines/xsyrk.cc delete mode 100644 test/performance/routines/xtrmm.cc create mode 100644 test/routines/level1/xaxpy.h create mode 100644 test/routines/level2/xgemv.h create mode 100644 test/routines/level3/xgemm.h create mode 100644 test/routines/level3/xhemm.h create mode 100644 test/routines/level3/xher2k.h create mode 100644 test/routines/level3/xherk.h create mode 100644 test/routines/level3/xsymm.h create mode 100644 test/routines/level3/xsyr2k.h create mode 100644 test/routines/level3/xsyrk.h create mode 100644 test/routines/level3/xtrmm.h delete mode 100644 test/routines/xaxpy.h delete mode 100644 test/routines/xgemm.h delete mode 100644 test/routines/xgemv.h delete mode 100644 test/routines/xhemm.h delete mode 100644 test/routines/xher2k.h delete mode 100644 test/routines/xherk.h delete mode 100644 test/routines/xsymm.h delete mode 100644 test/routines/xsyr2k.h delete mode 100644 test/routines/xsyrk.h delete mode 100644 test/routines/xtrmm.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b2c5657..c97ddd5b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,17 +95,23 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS}) # Sets the supported routines and the used kernels. New routines and kernels should be added here. 
set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm) set(SAMPLE_PROGRAMS sgemm) -set(ROUTINES - xaxpy - xgemv - xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm) +set(LEVEL1_ROUTINES xaxpy) +set(LEVEL2_ROUTINES xgemv) +set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm) +set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES}) # ================================================================================================== # Gathers all source-files set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc) -foreach(ROUTINE ${ROUTINES}) - set(SOURCES ${SOURCES} src/routines/${ROUTINE}.cc) +foreach(ROUTINE ${LEVEL1_ROUTINES}) + set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc) +endforeach() +foreach(ROUTINE ${LEVEL2_ROUTINES}) + set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cc) +endforeach() +foreach(ROUTINE ${LEVEL3_ROUTINES}) + set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc) endforeach() # Creates and links the library @@ -172,10 +178,19 @@ if(TESTS) test/correctness/tester.cc test/correctness/testblas.cc) # Compiles the correctness-tests + foreach(ROUTINE ${LEVEL1_ROUTINES}) + add_executable(test_${ROUTINE} $ + test/correctness/routines/level1/${ROUTINE}.cc) + endforeach() + foreach(ROUTINE ${LEVEL2_ROUTINES}) + add_executable(test_${ROUTINE} $ + test/correctness/routines/level2/${ROUTINE}.cc) + endforeach() + foreach(ROUTINE ${LEVEL3_ROUTINES}) + add_executable(test_${ROUTINE} $ + test/correctness/routines/level3/${ROUTINE}.cc) + endforeach() foreach(ROUTINE ${ROUTINES}) - add_executable(test_${ROUTINE} - $ - test/correctness/routines/${ROUTINE}.cc) target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES}) install(TARGETS test_${ROUTINE} DESTINATION bin) endforeach() @@ -184,9 +199,19 @@ if(TESTS) add_library(test_performance_common OBJECT test/performance/client.cc) # Compiles the performance-tests - foreach(ROUTINE ${ROUTINES}) + foreach(ROUTINE 
${LEVEL1_ROUTINES}) add_executable(client_${ROUTINE} $ - test/performance/routines/${ROUTINE}.cc) + test/performance/routines/level1/${ROUTINE}.cc) + endforeach() + foreach(ROUTINE ${LEVEL2_ROUTINES}) + add_executable(client_${ROUTINE} $ + test/performance/routines/level2/${ROUTINE}.cc) + endforeach() + foreach(ROUTINE ${LEVEL3_ROUTINES}) + add_executable(client_${ROUTINE} $ + test/performance/routines/level3/${ROUTINE}.cc) + endforeach() + foreach(ROUTINE ${ROUTINES}) target_link_libraries(client_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES}) install(TARGETS client_${ROUTINE} DESTINATION bin) endforeach() diff --git a/include/internal/routines/level1/xaxpy.h b/include/internal/routines/level1/xaxpy.h new file mode 100644 index 00000000..e548e553 --- /dev/null +++ b/include/internal/routines/level1/xaxpy.h @@ -0,0 +1,42 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xaxpy routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XAXPY_H_ +#define CLBLAST_ROUTINES_XAXPY_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xaxpy: public Routine { + public: + Xaxpy(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoAxpy(const size_t n, const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XAXPY_H_ +#endif diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h new file mode 100644 index 00000000..a3109036 --- /dev/null +++ b/include/internal/routines/level2/xgemv.h @@ -0,0 +1,46 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgemv routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XGEMV_H_ +#define CLBLAST_ROUTINES_XGEMV_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xgemv: public Routine { + public: + Xgemv(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoGemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XGEMV_H_ +#endif diff --git a/include/internal/routines/level3/xgemm.h b/include/internal/routines/level3/xgemm.h new file mode 100644 index 00000000..7ad4fcfb --- /dev/null +++ b/include/internal/routines/level3/xgemm.h @@ -0,0 +1,46 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgemm routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XGEMM_H_ +#define CLBLAST_ROUTINES_XGEMM_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xgemm: public Routine { + public: + Xgemm(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XGEMM_H_ +#endif diff --git a/include/internal/routines/level3/xhemm.h b/include/internal/routines/level3/xhemm.h new file mode 100644 index 00000000..6cc9d9ec --- /dev/null +++ b/include/internal/routines/level3/xhemm.h @@ -0,0 +1,58 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhemm routine. It is based on the generalized matrix multiplication +// routine (Xgemm). The implementation is very similar to the Xsymm routine. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHEMM_H_ +#define CLBLAST_ROUTINES_XHEMM_H_ + +#include "internal/routines/level3/xgemm.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xhemm: public Xgemm { + public: + + // Uses several variables from the Routine class + using Routine::db_; + using Routine::context_; + + // Uses several helper functions from the Routine class + using Routine::RunKernel; + using Routine::ErrorIn; + using Routine::TestMatrixA; + using Routine::GetProgramFromCache; + + // Uses the regular Xgemm routine + using Xgemm::DoGemm; + + // Constructor + Xhemm(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHEMM_H_ +#endif diff --git a/include/internal/routines/level3/xher2k.h b/include/internal/routines/level3/xher2k.h new file mode 100644 index 00000000..1836a812 --- /dev/null +++ b/include/internal/routines/level3/xher2k.h @@ -0,0 +1,48 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xher2k routine. The precision is implemented using the template argument +// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the +// Xsyr2k routine. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHER2K_H_ +#define CLBLAST_ROUTINES_XHER2K_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xher2k: public Routine { + public: + Xher2k(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHER2K_H_ +#endif diff --git a/include/internal/routines/level3/xherk.h b/include/internal/routines/level3/xherk.h new file mode 100644 index 00000000..9b361254 --- /dev/null +++ b/include/internal/routines/level3/xherk.h @@ -0,0 +1,47 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xherk routine. The precision is implemented using the template argument +// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the +// Xsyrk routine. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHERK_H_ +#define CLBLAST_ROUTINES_XHERK_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xherk: public Routine { + public: + Xherk(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const U alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const U beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHERK_H_ +#endif diff --git a/include/internal/routines/level3/xsymm.h b/include/internal/routines/level3/xsymm.h new file mode 100644 index 00000000..2028ceea --- /dev/null +++ b/include/internal/routines/level3/xsymm.h @@ -0,0 +1,60 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsymm routine. It is based on the generalized matrix multiplication +// routine (Xgemm). The Xsymm class inherits from the templated class Xgemm, allowing it to call the +// "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by +// transforming it into a general matrix, and then calls the regular GEMM code. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYMM_H_ +#define CLBLAST_ROUTINES_XSYMM_H_ + +#include "internal/routines/level3/xgemm.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsymm: public Xgemm { + public: + + // Uses several variables from the Routine class + using Routine::db_; + using Routine::context_; + + // Uses several helper functions from the Routine class + using Routine::RunKernel; + using Routine::ErrorIn; + using Routine::TestMatrixA; + using Routine::GetProgramFromCache; + + // Uses the regular Xgemm routine + using Xgemm::DoGemm; + + // Constructor + Xsymm(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// 
CLBLAST_ROUTINES_XSYMM_H_ +#endif diff --git a/include/internal/routines/level3/xsyr2k.h b/include/internal/routines/level3/xsyr2k.h new file mode 100644 index 00000000..6259313c --- /dev/null +++ b/include/internal/routines/level3/xsyr2k.h @@ -0,0 +1,48 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr2k routine. The precision is implemented using a template argument. +// The implementation is very similar to Xsyrk (see header for details), except for the fact that +// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYR2K_H_ +#define CLBLAST_ROUTINES_XSYR2K_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsyr2k: public Routine { + public: + Xsyr2k(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// 
================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSYR2K_H_ +#endif diff --git a/include/internal/routines/level3/xsyrk.h b/include/internal/routines/level3/xsyrk.h new file mode 100644 index 00000000..3dab731f --- /dev/null +++ b/include/internal/routines/level3/xsyrk.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyrk routine. The precision is implemented using a template argument. +// The implementation is based on the regular Xgemm routine and kernel, but with two main changes: +// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part. +// 2) The main Xgemm kernel masks workgroups not contributing to usefull data. This is only for +// performance reasons, as the actual masking is done later (see the first point). 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYRK_H_ +#define CLBLAST_ROUTINES_XSYRK_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsyrk: public Routine { + public: + Xsyrk(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSYRK_H_ +#endif diff --git a/include/internal/routines/level3/xtrmm.h b/include/internal/routines/level3/xtrmm.h new file mode 100644 index 00000000..4f49bebd --- /dev/null +++ b/include/internal/routines/level3/xtrmm.h @@ -0,0 +1,58 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrmm routine. The implementation is based on first transforming the +// upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM +// routine. Therefore, this class inherits from the Xgemm class. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTRMM_H_ +#define CLBLAST_ROUTINES_XTRMM_H_ + +#include "internal/routines/level3/xgemm.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xtrmm: public Xgemm { + public: + + // Uses several variables from the Routine class + using Routine::db_; + using Routine::context_; + + // Uses several helper functions from the Routine class + using Routine::RunKernel; + using Routine::ErrorIn; + using Routine::TestMatrixA; + using Routine::GetProgramFromCache; + + // Uses the regular Xgemm routine + using Xgemm::DoGemm; + + // Constructor + Xtrmm(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTRMM_H_ +#endif diff --git a/include/internal/routines/xaxpy.h b/include/internal/routines/xaxpy.h deleted file mode 100644 index e548e553..00000000 --- a/include/internal/routines/xaxpy.h +++ /dev/null @@ -1,42 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xaxpy routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XAXPY_H_ -#define CLBLAST_ROUTINES_XAXPY_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class Xaxpy: public Routine { - public: - Xaxpy(CommandQueue &queue, Event &event); - - // Templated-precision implementation of the routine - StatusCode DoAxpy(const size_t n, const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); - - private: - // Static variable to get the precision - const static Precision precision_; -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XAXPY_H_ -#endif diff --git a/include/internal/routines/xgemm.h b/include/internal/routines/xgemm.h deleted file mode 100644 index 7ad4fcfb..00000000 --- a/include/internal/routines/xgemm.h +++ /dev/null @@ -1,46 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xgemm routine. The precision is implemented using a template argument. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XGEMM_H_ -#define CLBLAST_ROUTINES_XGEMM_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class Xgemm: public Routine { - public: - Xgemm(CommandQueue &queue, Event &event); - - // Templated-precision implementation of the routine - StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); - - private: - // Static variable to get the precision - const static Precision precision_; -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XGEMM_H_ -#endif diff --git a/include/internal/routines/xgemv.h b/include/internal/routines/xgemv.h deleted file mode 100644 index a3109036..00000000 --- a/include/internal/routines/xgemv.h +++ /dev/null @@ -1,46 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xgemv routine. The precision is implemented using a template argument. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XGEMV_H_ -#define CLBLAST_ROUTINES_XGEMV_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class Xgemv: public Routine { - public: - Xgemv(CommandQueue &queue, Event &event); - - // Templated-precision implementation of the routine - StatusCode DoGemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); - - private: - // Static variable to get the precision - const static Precision precision_; -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XGEMV_H_ -#endif diff --git a/include/internal/routines/xhemm.h b/include/internal/routines/xhemm.h deleted file mode 100644 index 1b1a0dfa..00000000 --- a/include/internal/routines/xhemm.h +++ /dev/null @@ -1,58 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xhemm routine. It is based on the generalized matrix multiplication -// routine (Xgemm). The implementation is very similar to the Xsymm routine. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XHEMM_H_ -#define CLBLAST_ROUTINES_XHEMM_H_ - -#include "internal/routines/xgemm.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class Xhemm: public Xgemm { - public: - - // Uses several variables from the Routine class - using Routine::db_; - using Routine::context_; - - // Uses several helper functions from the Routine class - using Routine::RunKernel; - using Routine::ErrorIn; - using Routine::TestMatrixA; - using Routine::GetProgramFromCache; - - // Uses the regular Xgemm routine - using Xgemm::DoGemm; - - // Constructor - Xhemm(CommandQueue &queue, Event &event); - - // Templated-precision implementation of the routine - StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XHEMM_H_ -#endif diff --git a/include/internal/routines/xher2k.h b/include/internal/routines/xher2k.h deleted file mode 100644 index 1836a812..00000000 --- a/include/internal/routines/xher2k.h +++ /dev/null @@ -1,48 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xher2k routine. The precision is implemented using the template argument -// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the -// Xsyr2k routine. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XHER2K_H_ -#define CLBLAST_ROUTINES_XHER2K_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class Xher2k: public Routine { - public: - Xher2k(CommandQueue &queue, Event &event); - - // Templated-precision implementation of the routine - StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); - - private: - // Static variable to get the precision - const static Precision precision_; -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XHER2K_H_ -#endif diff --git a/include/internal/routines/xherk.h b/include/internal/routines/xherk.h deleted file mode 100644 index 9b361254..00000000 --- a/include/internal/routines/xherk.h +++ /dev/null @@ -1,47 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xherk routine. The precision is implemented using the template argument -// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the -// Xsyrk routine. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XHERK_H_ -#define CLBLAST_ROUTINES_XHERK_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class Xherk: public Routine { - public: - Xherk(CommandQueue &queue, Event &event); - - // Templated-precision implementation of the routine - StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const U alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); - - private: - // Static variable to get the precision - const static Precision precision_; -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XHERK_H_ -#endif diff --git a/include/internal/routines/xsymm.h b/include/internal/routines/xsymm.h deleted file mode 100644 index c6545164..00000000 --- a/include/internal/routines/xsymm.h +++ /dev/null @@ -1,60 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsymm routine. It is based on the generalized matrix multiplication -// routine (Xgemm). The Xsymm class inherits from the templated class Xgemm, allowing it to call the -// "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by -// transforming it into a general matrix, and then calls the regular GEMM code. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSYMM_H_ -#define CLBLAST_ROUTINES_XSYMM_H_ - -#include "internal/routines/xgemm.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class Xsymm: public Xgemm { - public: - - // Uses several variables from the Routine class - using Routine::db_; - using Routine::context_; - - // Uses several helper functions from the Routine class - using Routine::RunKernel; - using Routine::ErrorIn; - using Routine::TestMatrixA; - using Routine::GetProgramFromCache; - - // Uses the regular Xgemm routine - using Xgemm::DoGemm; - - // Constructor - Xsymm(CommandQueue &queue, Event &event); - - // Templated-precision implementation of the routine - StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// 
CLBLAST_ROUTINES_XSYMM_H_ -#endif diff --git a/include/internal/routines/xsyr2k.h b/include/internal/routines/xsyr2k.h deleted file mode 100644 index 6259313c..00000000 --- a/include/internal/routines/xsyr2k.h +++ /dev/null @@ -1,48 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsyr2k routine. The precision is implemented using a template argument. -// The implementation is very similar to Xsyrk (see header for details), except for the fact that -// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSYR2K_H_ -#define CLBLAST_ROUTINES_XSYR2K_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class Xsyr2k: public Routine { - public: - Xsyr2k(CommandQueue &queue, Event &event); - - // Templated-precision implementation of the routine - StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); - - private: - // Static variable to get the precision - const static Precision precision_; -}; - -// 
================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSYR2K_H_ -#endif diff --git a/include/internal/routines/xsyrk.h b/include/internal/routines/xsyrk.h deleted file mode 100644 index 3dab731f..00000000 --- a/include/internal/routines/xsyrk.h +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsyrk routine. The precision is implemented using a template argument. -// The implementation is based on the regular Xgemm routine and kernel, but with two main changes: -// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part. -// 2) The main Xgemm kernel masks workgroups not contributing to usefull data. This is only for -// performance reasons, as the actual masking is done later (see the first point). 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSYRK_H_ -#define CLBLAST_ROUTINES_XSYRK_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class Xsyrk: public Routine { - public: - Xsyrk(CommandQueue &queue, Event &event); - - // Templated-precision implementation of the routine - StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); - - private: - // Static variable to get the precision - const static Precision precision_; -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSYRK_H_ -#endif diff --git a/include/internal/routines/xtrmm.h b/include/internal/routines/xtrmm.h deleted file mode 100644 index af9f0266..00000000 --- a/include/internal/routines/xtrmm.h +++ /dev/null @@ -1,58 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xtrmm routine. The implementation is based on first transforming the -// upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM -// routine. Therefore, this class inherits from the Xgemm class. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XTRMM_H_ -#define CLBLAST_ROUTINES_XTRMM_H_ - -#include "internal/routines/xgemm.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class Xtrmm: public Xgemm { - public: - - // Uses several variables from the Routine class - using Routine::db_; - using Routine::context_; - - // Uses several helper functions from the Routine class - using Routine::RunKernel; - using Routine::ErrorIn; - using Routine::TestMatrixA; - using Routine::GetProgramFromCache; - - // Uses the regular Xgemm routine - using Xgemm::DoGemm; - - // Constructor - Xtrmm(CommandQueue &queue, Event &event); - - // Templated-precision implementation of the routine - StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XTRMM_H_ -#endif diff --git a/src/clblast.cc b/src/clblast.cc index 23046b01..b5d53ee6 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -18,20 +18,20 @@ #include "clblast.h" // BLAS level-1 includes -#include "internal/routines/xaxpy.h" +#include "internal/routines/level1/xaxpy.h" // BLAS level-2 includes -#include "internal/routines/xgemv.h" +#include "internal/routines/level2/xgemv.h" // BLAS level-3 includes -#include "internal/routines/xgemm.h" -#include "internal/routines/xsymm.h" -#include "internal/routines/xhemm.h" -#include "internal/routines/xsyrk.h" -#include 
"internal/routines/xherk.h" -#include "internal/routines/xsyr2k.h" -#include "internal/routines/xher2k.h" -#include "internal/routines/xtrmm.h" +#include "internal/routines/level3/xgemm.h" +#include "internal/routines/level3/xsymm.h" +#include "internal/routines/level3/xhemm.h" +#include "internal/routines/level3/xsyrk.h" +#include "internal/routines/level3/xherk.h" +#include "internal/routines/level3/xsyr2k.h" +#include "internal/routines/level3/xher2k.h" +#include "internal/routines/level3/xtrmm.h" namespace clblast { // ================================================================================================= diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc new file mode 100644 index 00000000..fba36851 --- /dev/null +++ b/src/routines/level1/xaxpy.cc @@ -0,0 +1,115 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xaxpy class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level1/xaxpy.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xaxpy::precision_ = Precision::kSingle; +template <> const Precision Xaxpy::precision_ = Precision::kDouble; +template <> const Precision Xaxpy::precision_ = Precision::kComplexSingle; +template <> const Precision Xaxpy::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xaxpy::Xaxpy(CommandQueue &queue, Event &event): + Routine(queue, event, {"Xaxpy"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines whether or not the fast-version can be used + bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && + (y_offset == 0) && (y_inc == 1) && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + + // If possible, run the fast-version of the kernel + auto kernel_name = (use_fast_kernel) ? 
"XaxpyFast" : "Xaxpy"; + + // Retrieves the Xaxpy kernel from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast(x_offset)); + kernel.SetArgument(4, static_cast(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, static_cast(y_offset)); + kernel.SetArgument(7, static_cast(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"]}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + if (ErrorIn(status)) { return status; } + + // Waits for all kernels to finish + queue_.Finish(); + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xaxpy; +template class Xaxpy; +template class Xaxpy; +template class Xaxpy; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc new file mode 100644 index 00000000..181337b6 --- /dev/null +++ b/src/routines/level2/xgemv.cc @@ -0,0 +1,146 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgemv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level2/xgemv.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xgemv::precision_ = Precision::kSingle; +template <> const Precision Xgemv::precision_ = Precision::kDouble; +template <> const Precision Xgemv::precision_ = Precision::kComplexSingle; +template <> const Precision Xgemv::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xgemv::Xgemv(CommandQueue &queue, Event &event): + Routine(queue, event, {"Xgemv"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xgemv::DoGemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Makes sure all dimensions are larger than zero + if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrix has an alternative layout (row or column-major). + auto a_altlayout = (layout == Layout::kRowMajor); + auto a_one = (a_altlayout) ? n : m; + auto a_two = (a_altlayout) ? m : n; + + // Swap m and n if the matrix is transposed + auto a_transposed = (a_transpose != Transpose::kNo); + auto m_real = (a_transposed) ? n : m; + auto n_real = (a_transposed) ? 
m : n; + + // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator) + auto a_rotated = a_transposed ^ a_altlayout; + + // In case of complex data-types, the transpose can also become a conjugate transpose + auto a_conjugate = (a_transpose == Transpose::kConjugate); + + // Tests the matrix and the vectors for validity + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines whether or not the fast-version can be used + bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && + IsMultiple(m, db_["WGS2"]*db_["WPT2"]) && + IsMultiple(n, db_["WGS2"]) && + IsMultiple(a_ld, db_["VW2"]); + bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) && + IsMultiple(m, db_["WGS3"]*db_["WPT3"]) && + IsMultiple(n, db_["WGS3"]) && + IsMultiple(a_ld, db_["VW3"]); + + // If possible, run the fast-version (rotated or non-rotated) of the kernel + auto kernel_name = "Xgemv"; + auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]); + auto global_size = m_ceiled / db_["WPT1"]; + auto local_size = db_["WGS1"]; + if (use_fast_kernel) { + kernel_name = "XgemvFast"; + global_size = m_real / db_["WPT2"]; + local_size = db_["WGS2"]; + } + if (use_fast_kernel_rot) { + kernel_name = "XgemvFastRot"; + global_size = m_real / db_["WPT3"]; + local_size = db_["WGS3"]; + } + + // Retrieves the Xgemv kernel from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(m_real)); + kernel.SetArgument(1, static_cast(n_real)); + kernel.SetArgument(2, alpha); + kernel.SetArgument(3, beta); + 
kernel.SetArgument(4, static_cast(a_rotated)); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, static_cast(a_offset)); + kernel.SetArgument(7, static_cast(a_ld)); + kernel.SetArgument(8, x_buffer()); + kernel.SetArgument(9, static_cast(x_offset)); + kernel.SetArgument(10, static_cast(x_inc)); + kernel.SetArgument(11, y_buffer()); + kernel.SetArgument(12, static_cast(y_offset)); + kernel.SetArgument(13, static_cast(y_inc)); + kernel.SetArgument(14, static_cast(a_conjugate)); + + // Launches the kernel + auto global = std::vector{global_size}; + auto local = std::vector{local_size}; + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Waits for all kernels to finish + queue_.Finish(); + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xgemv; +template class Xgemv; +template class Xgemv; +template class Xgemv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc new file mode 100644 index 00000000..f4a9f737 --- /dev/null +++ b/src/routines/level3/xgemm.cc @@ -0,0 +1,172 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgemm class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level3/xgemm.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xgemm::precision_ = Precision::kSingle; +template <> const Precision Xgemm::precision_ = Precision::kDouble; +template <> const Precision Xgemm::precision_ = Precision::kComplexSingle; +template <> const Precision Xgemm::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xgemm::Xgemm(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xgemm::DoGemm(const Layout layout, + const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. 
Note + // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of + // col-major) to be transformed, so transposing requirements are not the same as whether or not + // the matrix is actually transposed in memory. + auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); + auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && b_transpose == Transpose::kNo); + auto c_rotated = (layout == Layout::kRowMajor); + auto a_do_transpose = a_rotated; + auto b_do_transpose = !b_rotated; + auto c_do_transpose = c_rotated; + + // In case of complex data-types, the transpose can also become a conjugate transpose + auto a_conjugate = (a_transpose == Transpose::kConjugate); + auto b_conjugate = (b_transpose == Transpose::kConjugate); + + // Computes the first and second dimensions of the 3 matrices taking into account whether the + // matrices are rotated or not + auto a_one = (a_rotated) ? k : m; + auto a_two = (a_rotated) ? m : k; + auto b_one = (b_rotated) ? n : k; + auto b_two = (b_rotated) ? k : n; + auto c_one = (c_rotated) ? n : m; + auto c_two = (c_rotated) ? m : n; + + // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. 
Also tests that the leading dimensions of: + // matrix A cannot be less than K when rotated, or less than M when not-rotated + // matrix B cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N when rotated, or less than M when not-rotated + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of m, n, and k + auto m_ceiled = Ceil(m, db_["MWG"]); + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Allocates space on the device for padded and/or transposed input and output matrices. + try { + auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T)); + auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill + // them up until they reach a certain multiple of size (kernel parameter dependent). 
+ status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + m_ceiled, k_ceiled, m_ceiled, 0, temp_a, + a_do_transpose, a_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_b, + b_do_transpose, b_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Only necessary for matrix C if it used both as input and output + if (beta != static_cast(0)) { + status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer, + m_ceiled, n_ceiled, m_ceiled, 0, temp_c, + c_do_transpose, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + } + + // Retrieves the Xgemm kernel from the compiled binary + try { + auto kernel = Kernel(program, "Xgemm"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(m_ceiled)); + kernel.SetArgument(1, static_cast(n_ceiled)); + kernel.SetArgument(2, static_cast(k_ceiled)); + kernel.SetArgument(3, alpha); + kernel.SetArgument(4, beta); + kernel.SetArgument(5, temp_a()); + kernel.SetArgument(6, temp_b()); + kernel.SetArgument(7, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (m_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c, + c_one, c_two, c_ld, c_offset, c_buffer, + c_do_transpose, false, false, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xgemm; +template class Xgemm; +template class Xgemm; +template class Xgemm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc new file mode 100644 index 00000000..bc257c44 --- /dev/null +++ b/src/routines/level3/xhemm.cc @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhemm class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level3/xhemm.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xhemm::Xhemm(CommandQueue &queue, Event &event): + Xgemm(queue, event) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xhemm::DoHemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the + // left) or B (on the right) in the Xgemm routine. + auto k = (side == Side::kLeft) ? m : n; + + // Checks for validity of the squared A matrix + auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as + // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix + bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + auto kernel_name = (is_upper) ? 
"HermUpperToSquared" : "HermLowerToSquared"; + + // Temporary buffer for a copy of the hermitian matrix + try { + auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); + + // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm + // routine afterwards + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the hermitian-to-squared kernel + kernel.SetArgument(0, static_cast(k)); + kernel.SetArgument(1, static_cast(a_ld)); + kernel.SetArgument(2, static_cast(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast(k)); + kernel.SetArgument(5, static_cast(k)); + kernel.SetArgument(6, static_cast(0)); + kernel.SetArgument(7, temp_herm()); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // hermitian-to-squared kernel uses the same parameters. + auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the regular Xgemm code with either "C := AB+C" or ... + if (side == Side::kLeft) { + status = DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + temp_herm, 0, k, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld); + } + + // ... with "C := BA+C". Note that A and B are now reversed. 
+ else { + status = DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_herm, 0, k, + beta, + c_buffer, c_offset, c_ld); + + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(status) { + case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; + case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; + case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; + case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; + case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; + case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; + } + } + + // Return the status of the Xgemm routine + return status; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhemm; +template class Xhemm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc new file mode 100644 index 00000000..6d33a0e1 --- /dev/null +++ b/src/routines/level3/xher2k.cc @@ -0,0 +1,178 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xher2k class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level3/xher2k.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xher2k::precision_ = Precision::kComplexSingle; +template <> const Precision Xher2k::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xher2k::Xher2k(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or + // to matrix A (argument: conjugate transpose) + auto ab_conjugate = (ab_transpose != Transpose::kNo); + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. 
+ auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) || + (layout == Layout::kRowMajor && !ab_conjugate); + auto c_rotated = (layout == Layout::kRowMajor); + + // Computes the first and second dimensions of the A and B matrices taking the layout into account + auto ab_one = (ab_rotated) ? k : n; + auto ab_two = (ab_rotated) ? n : k; + + // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix B cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // Allocates space on the device for padded and/or transposed input and output matrices. 
+ try { + auto temp_a1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_b1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_a2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_b2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to + // fill them up until they reach a certain multiple of size (kernel parameter dependent). + status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a1, + ab_rotated, ab_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a2, + ab_rotated, !ab_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_b1, + ab_rotated, ab_conjugate, true, false, false, false, program); + status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_b2, + ab_rotated, !ab_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. 
+ status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + c_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + auto complex_beta = T{beta, static_cast(0.0)}; + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha); + kernel.SetArgument(3, complex_beta); + kernel.SetArgument(4, temp_a1()); + kernel.SetArgument(5, temp_b2()); + kernel.SetArgument(6, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha + auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; + auto complex_one = T{static_cast(1.0), static_cast(0.0)}; + kernel.SetArgument(2, conjugate_alpha); + kernel.SetArgument(3, complex_one); + kernel.SetArgument(4, temp_b1()); + kernel.SetArgument(5, temp_a2()); + + // Runs the kernel again + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + n, n, c_ld, c_offset, c_buffer, + c_rotated, false, false, upper, lower, true, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xher2k; +template class Xher2k; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc new file mode 100644 index 00000000..8fae294f --- /dev/null +++ b/src/routines/level3/xherk.cc @@ -0,0 +1,156 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xherk class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level3/xherk.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xherk::precision_ = Precision::kComplexSingle; +template <> const Precision Xherk::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xherk::Xherk(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const U alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const U beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or + // to matrix A (argument: conjugate transpose) + auto a_conjugate = (a_transpose != Transpose::kNo); + auto b_conjugate = (a_transpose == Transpose::kNo); + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. 
+ auto a_rotated = (layout == Layout::kColMajor && a_conjugate) || + (layout == Layout::kRowMajor && !a_conjugate); + auto c_rotated = (layout == Layout::kRowMajor); + + // Computes the first and second dimensions of the A matrix taking the layout into account + auto a_one = (a_rotated) ? k : n; + auto a_two = (a_rotated) ? n : k; + + // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // Allocates space on the device for padded and/or transposed input and output matrices. + try { + auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to + // fill it up until it reaches a certain multiple of size (kernel parameter dependent). 
It + // creates two copies: + status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a, + a_rotated, a_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_b, + a_rotated, b_conjugate, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + c_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + auto complex_alpha = T{alpha, static_cast(0.0)}; + auto complex_beta = T{beta, static_cast(0.0)}; + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, complex_alpha); + kernel.SetArgument(3, complex_beta); + kernel.SetArgument(4, temp_a()); + kernel.SetArgument(5, temp_b()); + kernel.SetArgument(6, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + n, n, c_ld, c_offset, c_buffer, + c_rotated, false, false, upper, lower, true, program); + if 
(ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xherk; +template class Xherk; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc new file mode 100644 index 00000000..1d17f0eb --- /dev/null +++ b/src/routines/level3/xsymm.cc @@ -0,0 +1,132 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsymm class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level3/xsymm.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsymm::Xsymm(CommandQueue &queue, Event &event): + Xgemm(queue, event) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsymm::DoSymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the + // left) or B (on the right) in the Xgemm routine. + auto k = (side == Side::kLeft) ? m : n; + + // Checks for validity of the squared A matrix + auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as + // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix + bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + auto kernel_name = (is_upper) ? 
"SymmUpperToSquared" : "SymmLowerToSquared"; + + // Temporary buffer for a copy of the symmetric matrix + try { + auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); + + // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm + // routine afterwards + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the symmetric-to-squared kernel + kernel.SetArgument(0, static_cast(k)); + kernel.SetArgument(1, static_cast(a_ld)); + kernel.SetArgument(2, static_cast(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast(k)); + kernel.SetArgument(5, static_cast(k)); + kernel.SetArgument(6, static_cast(0)); + kernel.SetArgument(7, temp_symm()); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // symmetric-to-squared kernel uses the same parameters. + auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the regular Xgemm code with either "C := AB+C" or ... + if (side == Side::kLeft) { + status = DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + temp_symm, 0, k, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld); + } + + // ... with "C := BA+C". Note that A and B are now reversed. 
+ else { + status = DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_symm, 0, k, + beta, + c_buffer, c_offset, c_ld); + + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(status) { + case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; + case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; + case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; + case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; + case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; + case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; + } + } + + // Return the status of the Xgemm routine + return status; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsymm; +template class Xsymm; +template class Xsymm; +template class Xsymm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc new file mode 100644 index 00000000..d54f2fc1 --- /dev/null +++ b/src/routines/level3/xsyr2k.cc @@ -0,0 +1,166 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr2k class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level3/xsyr2k.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xsyr2k::precision_ = Precision::kSingle; +template <> const Precision Xsyr2k::precision_ = Precision::kDouble; +template <> const Precision Xsyr2k::precision_ = Precision::kComplexSingle; +template <> const Precision Xsyr2k::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsyr2k::Xsyr2k(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. 
+ auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo); + auto c_rotated = (layout == Layout::kRowMajor); + + // Computes the first and second dimensions of the A and B matrices taking the layout into account + auto ab_one = (ab_rotated) ? k : n; + auto ab_two = (ab_rotated) ? n : k; + + // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix B cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // Allocates space on the device for padded and/or transposed input and output matrices. 
+ try { + auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to + // fill them up until they reach a certain multiple of size (kernel parameter dependent). + status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a, + ab_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_b, + ab_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. 
+ status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + c_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha); + kernel.SetArgument(3, beta); + kernel.SetArgument(4, temp_a()); + kernel.SetArgument(5, temp_b()); + kernel.SetArgument(6, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Swaps the arguments for matrices A and B, and sets 'beta' to 1 + auto one = static_cast(1); + kernel.SetArgument(3, one); + kernel.SetArgument(4, temp_b()); + kernel.SetArgument(5, temp_a()); + + // Runs the kernel again + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + n, n, c_ld, c_offset, c_buffer, + c_rotated, false, false, upper, lower, false, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsyr2k; +template class Xsyr2k; +template class Xsyr2k; +template class Xsyr2k; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc new file mode 100644 index 00000000..bb952410 --- /dev/null +++ b/src/routines/level3/xsyrk.cc @@ -0,0 +1,147 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyrk class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level3/xsyrk.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xsyrk::precision_ = Precision::kSingle; +template <> const Precision Xsyrk::precision_ = Precision::kDouble; +template <> const Precision Xsyrk::precision_ = Precision::kComplexSingle; +template <> const Precision Xsyrk::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsyrk::Xsyrk(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. 
+ auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); + auto c_rotated = (layout == Layout::kRowMajor); + + // Computes the first and second dimensions of the A matrix taking the layout into account + auto a_one = (a_rotated) ? k : n; + auto a_two = (a_rotated) ? n : k; + + // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // Allocates space on the device for padded and/or transposed input and output matrices. + try { + auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to + // fill it up until it reaches a certain multiple of size (kernel parameter dependent). 
+ status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a, + a_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + c_rotated, false, true, false, false, false, program); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha); + kernel.SetArgument(3, beta); + kernel.SetArgument(4, temp_a()); + kernel.SetArgument(5, temp_a()); + kernel.SetArgument(6, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + n, n, c_ld, c_offset, c_buffer, + c_rotated, false, false, upper, lower, false, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsyrk; +template class Xsyrk; +template class Xsyrk; +template class Xsyrk; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc new file mode 100644 index 00000000..52f272e3 --- /dev/null +++ b/src/routines/level3/xtrmm.cc @@ -0,0 +1,135 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrmm class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level3/xtrmm.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xtrmm::Xtrmm(CommandQueue &queue, Event &event): + Xgemm(queue, event) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } + + // Computes the k dimension. This is based on whether or not matrix is A (on the left) + // or B (on the right) in the Xgemm routine. + auto k = (side == Side::kLeft) ? m : n; + + // Checks for validity of the triangular A matrix + auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as + // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix + bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared"; + + // Determines whether or not the triangular matrix is unit-diagonal + auto unit_diagonal = (diagonal == Diagonal::kUnit) ? 
true : false; + + // Temporary buffer for a copy of the triangular matrix + try { + auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); + + // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm + // routine afterwards + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the triangular-to-squared kernel + kernel.SetArgument(0, static_cast(k)); + kernel.SetArgument(1, static_cast(a_ld)); + kernel.SetArgument(2, static_cast(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast(k)); + kernel.SetArgument(5, static_cast(k)); + kernel.SetArgument(6, static_cast(0)); + kernel.SetArgument(7, temp_triangular()); + kernel.SetArgument(8, static_cast(unit_diagonal)); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // triangular-to-squared kernel uses the same parameters. + auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the regular Xgemm code with either "B := alpha*A*B" or ... + if (side == Side::kLeft) { + status = DoGemm(layout, a_transpose, Transpose::kNo, + m, n, k, + alpha, + temp_triangular, 0, k, + b_buffer, b_offset, b_ld, + static_cast(0.0), + b_buffer, b_offset, b_ld); + } + + // ... with "B := alpha*B*A". Note that A and B are now reversed. 
+ else { + status = DoGemm(layout, Transpose::kNo, a_transpose, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_triangular, 0, k, + static_cast(0.0), + b_buffer, b_offset, b_ld); + + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(status) { + case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; + case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; + case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; + case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; + case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; + case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; + } + } + + // Return the status of the Xgemm routine + return status; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrmm; +template class Xtrmm; +template class Xtrmm; +template class Xtrmm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/xaxpy.cc b/src/routines/xaxpy.cc deleted file mode 100644 index b68458da..00000000 --- a/src/routines/xaxpy.cc +++ /dev/null @@ -1,115 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xaxpy class (see the header for information about the class). -// -// ================================================================================================= - -#include "internal/routines/xaxpy.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xaxpy::precision_ = Precision::kSingle; -template <> const Precision Xaxpy::precision_ = Precision::kDouble; -template <> const Precision Xaxpy::precision_ = Precision::kComplexSingle; -template <> const Precision Xaxpy::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xaxpy::Xaxpy(CommandQueue &queue, Event &event): - Routine(queue, event, {"Xaxpy"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines whether or not the fast-version can be used - bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && - (y_offset == 0) && (y_inc == 1) && - IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); - - // If possible, 
run the fast-version of the kernel - auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy"; - - // Retrieves the Xaxpy kernel from the compiled binary - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast(x_offset)); - kernel.SetArgument(4, static_cast(x_inc)); - kernel.SetArgument(5, y_buffer()); - kernel.SetArgument(6, static_cast(y_offset)); - kernel.SetArgument(7, static_cast(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector{n_ceiled/db_["WPT"]}; - auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local); - } - if (ErrorIn(status)) { return status; } - - // Waits for all kernels to finish - queue_.Finish(); - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xaxpy; -template class Xaxpy; -template class Xaxpy; -template class Xaxpy; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xgemm.cc b/src/routines/xgemm.cc deleted file mode 100644 index c8674282..00000000 --- a/src/routines/xgemm.cc +++ /dev/null @@ -1,172 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xgemm class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "internal/routines/xgemm.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xgemm::precision_ = Precision::kSingle; -template <> const Precision Xgemm::precision_ = Precision::kDouble; -template <> const Precision Xgemm::precision_ = Precision::kComplexSingle; -template <> const Precision Xgemm::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xgemm::Xgemm(CommandQueue &queue, Event &event): - Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xgemm::DoGemm(const Layout layout, - const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; } - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. 
Note - // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of - // col-major) to be transformed, so transposing requirements are not the same as whether or not - // the matrix is actually transposed in memory. - auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); - auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && b_transpose == Transpose::kNo); - auto c_rotated = (layout == Layout::kRowMajor); - auto a_do_transpose = a_rotated; - auto b_do_transpose = !b_rotated; - auto c_do_transpose = c_rotated; - - // In case of complex data-types, the transpose can also become a conjugate transpose - auto a_conjugate = (a_transpose == Transpose::kConjugate); - auto b_conjugate = (b_transpose == Transpose::kConjugate); - - // Computes the first and second dimensions of the 3 matrices taking into account whether the - // matrices are rotated or not - auto a_one = (a_rotated) ? k : m; - auto a_two = (a_rotated) ? m : k; - auto b_one = (b_rotated) ? n : k; - auto b_two = (b_rotated) ? k : n; - auto c_one = (c_rotated) ? n : m; - auto c_two = (c_rotated) ? m : n; - - // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. 
Also tests that the leading dimensions of: - // matrix A cannot be less than K when rotated, or less than M when not-rotated - // matrix B cannot be less than N when rotated, or less than K when not-rotated - // matrix C cannot be less than N when rotated, or less than M when not-rotated - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of m, n, and k - auto m_ceiled = Ceil(m, db_["MWG"]); - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Allocates space on the device for padded and/or transposed input and output matrices. - try { - auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T)); - auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T)); - - // Loads the program from the database - auto& program = GetProgramFromCache(); - - // Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill - // them up until they reach a certain multiple of size (kernel parameter dependent). 
- status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, - m_ceiled, k_ceiled, m_ceiled, 0, temp_a, - a_do_transpose, a_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_b, - b_do_transpose, b_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Only necessary for matrix C if it used both as input and output - if (beta != static_cast(0)) { - status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer, - m_ceiled, n_ceiled, m_ceiled, 0, temp_c, - c_do_transpose, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - } - - // Retrieves the Xgemm kernel from the compiled binary - try { - auto kernel = Kernel(program, "Xgemm"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(m_ceiled)); - kernel.SetArgument(1, static_cast(n_ceiled)); - kernel.SetArgument(2, static_cast(k_ceiled)); - kernel.SetArgument(3, alpha); - kernel.SetArgument(4, beta); - kernel.SetArgument(5, temp_a()); - kernel.SetArgument(6, temp_b()); - kernel.SetArgument(7, temp_c()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (m_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the post-processing kernel - status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c, - c_one, c_two, c_ld, c_offset, c_buffer, - c_do_transpose, false, false, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xgemm; -template class Xgemm; -template class Xgemm; -template class Xgemm; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xgemv.cc b/src/routines/xgemv.cc deleted file mode 100644 index 1868dec4..00000000 --- a/src/routines/xgemv.cc +++ /dev/null @@ -1,146 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xgemv class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "internal/routines/xgemv.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xgemv::precision_ = Precision::kSingle; -template <> const Precision Xgemv::precision_ = Precision::kDouble; -template <> const Precision Xgemv::precision_ = Precision::kComplexSingle; -template <> const Precision Xgemv::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xgemv::Xgemv(CommandQueue &queue, Event &event): - Routine(queue, event, {"Xgemv"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xgemv::DoGemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // Makes sure all dimensions are larger than zero - if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } - - // Computes whether or not the matrix has an alternative layout (row or column-major). - auto a_altlayout = (layout == Layout::kRowMajor); - auto a_one = (a_altlayout) ? n : m; - auto a_two = (a_altlayout) ? m : n; - - // Swap m and n if the matrix is transposed - auto a_transposed = (a_transpose != Transpose::kNo); - auto m_real = (a_transposed) ? n : m; - auto n_real = (a_transposed) ? 
m : n; - - // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator) - auto a_rotated = a_transposed ^ a_altlayout; - - // In case of complex data-types, the transpose can also become a conjugate transpose - auto a_conjugate = (a_transpose == Transpose::kConjugate); - - // Tests the matrix and the vectors for validity - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines whether or not the fast-version can be used - bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && - IsMultiple(m, db_["WGS2"]*db_["WPT2"]) && - IsMultiple(n, db_["WGS2"]) && - IsMultiple(a_ld, db_["VW2"]); - bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) && - IsMultiple(m, db_["WGS3"]*db_["WPT3"]) && - IsMultiple(n, db_["WGS3"]) && - IsMultiple(a_ld, db_["VW3"]); - - // If possible, run the fast-version (rotated or non-rotated) of the kernel - auto kernel_name = "Xgemv"; - auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]); - auto global_size = m_ceiled / db_["WPT1"]; - auto local_size = db_["WGS1"]; - if (use_fast_kernel) { - kernel_name = "XgemvFast"; - global_size = m_real / db_["WPT2"]; - local_size = db_["WGS2"]; - } - if (use_fast_kernel_rot) { - kernel_name = "XgemvFastRot"; - global_size = m_real / db_["WPT3"]; - local_size = db_["WGS3"]; - } - - // Retrieves the Xgemv kernel from the compiled binary - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(m_real)); - kernel.SetArgument(1, static_cast(n_real)); - kernel.SetArgument(2, alpha); - kernel.SetArgument(3, beta); - 
kernel.SetArgument(4, static_cast(a_rotated)); - kernel.SetArgument(5, a_buffer()); - kernel.SetArgument(6, static_cast(a_offset)); - kernel.SetArgument(7, static_cast(a_ld)); - kernel.SetArgument(8, x_buffer()); - kernel.SetArgument(9, static_cast(x_offset)); - kernel.SetArgument(10, static_cast(x_inc)); - kernel.SetArgument(11, y_buffer()); - kernel.SetArgument(12, static_cast(y_offset)); - kernel.SetArgument(13, static_cast(y_inc)); - kernel.SetArgument(14, static_cast(a_conjugate)); - - // Launches the kernel - auto global = std::vector{global_size}; - auto local = std::vector{local_size}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Waits for all kernels to finish - queue_.Finish(); - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xgemv; -template class Xgemv; -template class Xgemv; -template class Xgemv; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xhemm.cc b/src/routines/xhemm.cc deleted file mode 100644 index 73f769ed..00000000 --- a/src/routines/xhemm.cc +++ /dev/null @@ -1,130 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xhemm class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "internal/routines/xhemm.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xhemm::Xhemm(CommandQueue &queue, Event &event): - Xgemm(queue, event) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xhemm::DoHemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } - - // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the - // left) or B (on the right) in the Xgemm routine. - auto k = (side == Side::kLeft) ? m : n; - - // Checks for validity of the squared A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? 
"HermUpperToSquared" : "HermLowerToSquared"; - - // Temporary buffer for a copy of the hermitian matrix - try { - auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); - - // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm - // routine afterwards - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the hermitian-to-squared kernel - kernel.SetArgument(0, static_cast(k)); - kernel.SetArgument(1, static_cast(a_ld)); - kernel.SetArgument(2, static_cast(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast(k)); - kernel.SetArgument(5, static_cast(k)); - kernel.SetArgument(6, static_cast(0)); - kernel.SetArgument(7, temp_herm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // hermitian-to-squared kernel uses the same parameters. - auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the regular Xgemm code with either "C := AB+C" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - temp_herm, 0, k, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld); - } - - // ... with "C := BA+C". Note that A and B are now reversed. 
- else { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_herm, 0, k, - beta, - c_buffer, c_offset, c_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } - } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xhemm; -template class Xhemm; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xher2k.cc b/src/routines/xher2k.cc deleted file mode 100644 index b19b743b..00000000 --- a/src/routines/xher2k.cc +++ /dev/null @@ -1,178 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xher2k class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "internal/routines/xher2k.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xher2k::precision_ = Precision::kComplexSingle; -template <> const Precision Xher2k::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xher2k::Xher2k(CommandQueue &queue, Event &event): - Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } - - // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or - // to matrix A (argument: conjugate transpose) - auto ab_conjugate = (ab_transpose != Transpose::kNo); - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. 
- auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) || - (layout == Layout::kRowMajor && !ab_conjugate); - auto c_rotated = (layout == Layout::kRowMajor); - - // Computes the first and second dimensions of the A and B matrices taking the layout into account - auto ab_one = (ab_rotated) ? k : n; - auto ab_two = (ab_rotated) ? n : k; - - // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. Also tests that the leading dimensions of: - // matrix A cannot be less than K when rotated, or less than N when not-rotated - // matrix B cannot be less than K when rotated, or less than N when not-rotated - // matrix C cannot be less than N - auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Decides which kernel to run: the upper-triangular or lower-triangular version - auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - - // Allocates space on the device for padded and/or transposed input and output matrices. 
- try { - auto temp_a1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_b1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_a2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_b2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); - - // Loads the program from the database - auto& program = GetProgramFromCache(); - - // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to - // fill them up until they reach a certain multiple of size (kernel parameter dependent). The - // status of every copy is checked, such that a failure is never overwritten by a later call. - status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_a1, - ab_rotated, ab_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_a2, - ab_rotated, !ab_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_b1, - ab_rotated, ab_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_b2, - ab_rotated, !ab_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. 
- status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - auto complex_beta = T{beta, static_cast(0.0)}; - kernel.SetArgument(0, static_cast(n_ceiled)); - kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha); - kernel.SetArgument(3, complex_beta); - kernel.SetArgument(4, temp_a1()); - kernel.SetArgument(5, temp_b2()); - kernel.SetArgument(6, temp_c()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha - auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; - auto complex_one = T{static_cast(1.0), static_cast(0.0)}; - kernel.SetArgument(2, conjugate_alpha); - kernel.SetArgument(3, complex_one); - kernel.SetArgument(4, temp_b1()); - kernel.SetArgument(5, temp_a2()); - - // Runs the kernel again - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, true, program); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xher2k; -template class Xher2k; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xherk.cc b/src/routines/xherk.cc deleted file mode 100644 index 6bc9cd6c..00000000 --- a/src/routines/xherk.cc +++ /dev/null @@ -1,156 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xherk class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "internal/routines/xherk.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xherk::precision_ = Precision::kComplexSingle; -template <> const Precision Xherk::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xherk::Xherk(CommandQueue &queue, Event &event): - Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const U alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } - - // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or - // to matrix A (argument: conjugate transpose) - auto a_conjugate = (a_transpose != Transpose::kNo); - auto b_conjugate = (a_transpose == Transpose::kNo); - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. 
- auto a_rotated = (layout == Layout::kColMajor && a_conjugate) || - (layout == Layout::kRowMajor && !a_conjugate); - auto c_rotated = (layout == Layout::kRowMajor); - - // Computes the first and second dimensions of the A matrix taking the layout into account - auto a_one = (a_rotated) ? k : n; - auto a_two = (a_rotated) ? n : k; - - // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. Also tests that the leading dimensions of: - // matrix A cannot be less than K when rotated, or less than N when not-rotated - // matrix C cannot be less than N - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Decides which kernel to run: the upper-triangular or lower-triangular version - auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - - // Allocates space on the device for padded and/or transposed input and output matrices. - try { - auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); - - // Loads the program from the database - auto& program = GetProgramFromCache(); - - // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to - // fill it up until it reaches a certain multiple of size (kernel parameter dependent). 
It - // creates two copies: - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_a, - a_rotated, a_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_b, - a_rotated, b_conjugate, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - auto complex_alpha = T{alpha, static_cast(0.0)}; - auto complex_beta = T{beta, static_cast(0.0)}; - kernel.SetArgument(0, static_cast(n_ceiled)); - kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, complex_alpha); - kernel.SetArgument(3, complex_beta); - kernel.SetArgument(4, temp_a()); - kernel.SetArgument(5, temp_b()); - kernel.SetArgument(6, temp_c()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, true, program); - if 
(ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xherk; -template class Xherk; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xsymm.cc b/src/routines/xsymm.cc deleted file mode 100644 index b39eb24d..00000000 --- a/src/routines/xsymm.cc +++ /dev/null @@ -1,132 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsymm class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "internal/routines/xsymm.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xsymm::Xsymm(CommandQueue &queue, Event &event): - Xgemm(queue, event) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xsymm::DoSymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } - - // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the - // left) or B (on the right) in the Xgemm routine. - auto k = (side == Side::kLeft) ? m : n; - - // Checks for validity of the squared A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? 
"SymmUpperToSquared" : "SymmLowerToSquared"; - - // Temporary buffer for a copy of the symmetric matrix - try { - auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); - - // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm - // routine afterwards - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the symmetric-to-squared kernel - kernel.SetArgument(0, static_cast(k)); - kernel.SetArgument(1, static_cast(a_ld)); - kernel.SetArgument(2, static_cast(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast(k)); - kernel.SetArgument(5, static_cast(k)); - kernel.SetArgument(6, static_cast(0)); - kernel.SetArgument(7, temp_symm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // symmetric-to-squared kernel uses the same parameters. - auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the regular Xgemm code with either "C := AB+C" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - temp_symm, 0, k, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld); - } - - // ... with "C := BA+C". Note that A and B are now reversed. 
- else { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_symm, 0, k, - beta, - c_buffer, c_offset, c_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } - } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xsymm; -template class Xsymm; -template class Xsymm; -template class Xsymm; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xsyr2k.cc b/src/routines/xsyr2k.cc deleted file mode 100644 index abb8b7eb..00000000 --- a/src/routines/xsyr2k.cc +++ /dev/null @@ -1,166 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsyr2k class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "internal/routines/xsyr2k.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xsyr2k::precision_ = Precision::kSingle; -template <> const Precision Xsyr2k::precision_ = Precision::kDouble; -template <> const Precision Xsyr2k::precision_ = Precision::kComplexSingle; -template <> const Precision Xsyr2k::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xsyr2k::Xsyr2k(CommandQueue &queue, Event &event): - Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. 
- auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo); - auto c_rotated = (layout == Layout::kRowMajor); - - // Computes the first and second dimensions of the A and B matrices taking the layout into account - auto ab_one = (ab_rotated) ? k : n; - auto ab_two = (ab_rotated) ? n : k; - - // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. Also tests that the leading dimensions of: - // matrix A cannot be less than N when rotated, or less than K when not-rotated - // matrix B cannot be less than N when rotated, or less than K when not-rotated - // matrix C cannot be less than N - auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Decides which kernel to run: the upper-triangular or lower-triangular version - auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - - // Allocates space on the device for padded and/or transposed input and output matrices. 
- try { - auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); - - // Loads the program from the database - auto& program = GetProgramFromCache(); - - // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to - // fill them up until they reach a certain multiple of size (kernel parameter dependent). - status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_a, - ab_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_b, - ab_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. 
- status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(n_ceiled)); - kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha); - kernel.SetArgument(3, beta); - kernel.SetArgument(4, temp_a()); - kernel.SetArgument(5, temp_b()); - kernel.SetArgument(6, temp_c()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Swaps the arguments for matrices A and B, and sets 'beta' to 1 - auto one = static_cast(1); - kernel.SetArgument(3, one); - kernel.SetArgument(4, temp_b()); - kernel.SetArgument(5, temp_a()); - - // Runs the kernel again - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, false, program); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xsyr2k; -template class Xsyr2k; -template class Xsyr2k; -template class Xsyr2k; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xsyrk.cc b/src/routines/xsyrk.cc deleted file mode 100644 index 3efa0598..00000000 --- a/src/routines/xsyrk.cc +++ /dev/null @@ -1,147 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsyrk class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "internal/routines/xsyrk.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Specific implementations to get the memory-type based on a template argument -template <> const Precision Xsyrk::precision_ = Precision::kSingle; -template <> const Precision Xsyrk::precision_ = Precision::kDouble; -template <> const Precision Xsyrk::precision_ = Precision::kComplexSingle; -template <> const Precision Xsyrk::precision_ = Precision::kComplexDouble; - -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xsyrk::Xsyrk(CommandQueue &queue, Event &event): - Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. 
- auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); - auto c_rotated = (layout == Layout::kRowMajor); - - // Computes the first and second dimensions of the A matrix taking the layout into account - auto a_one = (a_rotated) ? k : n; - auto a_two = (a_rotated) ? n : k; - - // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. Also tests that the leading dimensions of: - // matrix A cannot be less than N when rotated, or less than K when not-rotated - // matrix C cannot be less than N - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Decides which kernel to run: the upper-triangular or lower-triangular version - auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - - // Allocates space on the device for padded and/or transposed input and output matrices. - try { - auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); - - // Loads the program from the database - auto& program = GetProgramFromCache(); - - // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to - // fill it up until it reaches a certain multiple of size (kernel parameter dependent). 
- status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, temp_a, - a_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, false, program); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(n_ceiled)); - kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha); - kernel.SetArgument(3, beta); - kernel.SetArgument(4, temp_a()); - kernel.SetArgument(5, temp_a()); - kernel.SetArgument(6, temp_c()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, false, program); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xsyrk; -template class Xsyrk; -template class Xsyrk; -template class Xsyrk; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/xtrmm.cc b/src/routines/xtrmm.cc deleted file mode 100644 index 543df844..00000000 --- a/src/routines/xtrmm.cc +++ /dev/null @@ -1,135 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xtrmm class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "internal/routines/xtrmm.h" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xtrmm::Xtrmm(CommandQueue &queue, Event &event): - Xgemm(queue, event) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { - - // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } - - // Computes the k dimension. This is based on whether or not matrix is A (on the left) - // or B (on the right) in the Xgemm routine. - auto k = (side == Side::kLeft) ? m : n; - - // Checks for validity of the triangular A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared"; - - // Determines whether or not the triangular matrix is unit-diagonal - auto unit_diagonal = (diagonal == Diagonal::kUnit) ? 
true : false; - - // Temporary buffer for a copy of the triangular matrix - try { - auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); - - // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm - // routine afterwards - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the triangular-to-squared kernel - kernel.SetArgument(0, static_cast(k)); - kernel.SetArgument(1, static_cast(a_ld)); - kernel.SetArgument(2, static_cast(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast(k)); - kernel.SetArgument(5, static_cast(k)); - kernel.SetArgument(6, static_cast(0)); - kernel.SetArgument(7, temp_triangular()); - kernel.SetArgument(8, static_cast(unit_diagonal)); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // triangular-to-squared kernel uses the same parameters. - auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the regular Xgemm code with either "B := alpha*A*B" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, a_transpose, Transpose::kNo, - m, n, k, - alpha, - temp_triangular, 0, k, - b_buffer, b_offset, b_ld, - static_cast(0.0), - b_buffer, b_offset, b_ld); - } - - // ... with "B := alpha*B*A". Note that A and B are now reversed. 
- else { - status = DoGemm(layout, Transpose::kNo, a_transpose, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_triangular, 0, k, - static_cast(0.0), - b_buffer, b_offset, b_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } - } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xtrmm; -template class Xtrmm; -template class Xtrmm; -template class Xtrmm; - -// ================================================================================================= -} // namespace clblast diff --git a/test/correctness/routines/level1/xaxpy.cc b/test/correctness/routines/level1/xaxpy.cc new file mode 100644 index 00000000..ac44caec --- /dev/null +++ b/test/correctness/routines/level1/xaxpy.cc @@ -0,0 +1,81 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xaxpy routine. +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xaxpy.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester +template +void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates a tester + TestBlas tester{argc, argv, silent, name, TestXaxpy::GetOptions(), + TestXaxpy::RunRoutine, TestXaxpy::RunReference, + TestXaxpy::DownloadResult, TestXaxpy::GetResultIndex, + TestXaxpy::ResultID1, TestXaxpy::ResultID2}; + + // This variable holds the arguments relevant for this routine + auto args = Arguments{}; + + // Creates the arguments vector for the regular tests + auto regular_test_vector = std::vector>{}; + for (auto &n: tester.kVectorDims) { args.n = n; + for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc; + for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset; + for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc; + for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset; + for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; + args.x_size = TestXaxpy::GetSizeX(args); + args.y_size = TestXaxpy::GetSizeY(args); + if (args.x_size<1 || args.y_size<1) { continue; } + regular_test_vector.push_back(args); + } + } + } + } + } + } + + // Creates the arguments vector for the invalid-buffer tests + auto invalid_test_vector = std::vector>{}; + args.n = tester.kBufferSize; + args.x_inc = args.y_inc = 1; + args.x_offset = args.y_offset = 0; + for (auto &x_size: tester.kVecSizes) { args.x_size = x_size; + for (auto &y_size: tester.kVecSizes) { args.y_size = y_size; + invalid_test_vector.push_back(args); + } + } + + // Runs the tests + const auto case_name = "default"; + 
tester.TestRegular(regular_test_vector, case_name); + tester.TestInvalid(invalid_test_vector, case_name); +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTest(argc, argv, false, "SAXPY"); + clblast::RunTest(argc, argv, true, "DAXPY"); + clblast::RunTest(argc, argv, true, "CAXPY"); + clblast::RunTest(argc, argv, true, "ZAXPY"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xgemv.cc b/test/correctness/routines/level2/xgemv.cc new file mode 100644 index 00000000..4e6942cc --- /dev/null +++ b/test/correctness/routines/level2/xgemv.cc @@ -0,0 +1,99 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xgemv routine. 
+// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester +template +void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates a tester + TestBlas tester{argc, argv, silent, name, TestXgemv::GetOptions(), + TestXgemv::RunRoutine, TestXgemv::RunReference, + TestXgemv::DownloadResult, TestXgemv::GetResultIndex, + TestXgemv::ResultID1, TestXgemv::ResultID2}; + + // This variable holds the arguments relevant for this routine + auto args = Arguments{}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { args.layout = layout; + for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose; + + // Creates the arguments vector for the regular tests + auto regular_test_vector = std::vector>{}; + for (auto &m: tester.kMatrixVectorDims) { args.m = m; + for (auto &n: tester.kMatrixVectorDims) { args.n = n; + for (auto &a_ld: tester.kMatrixVectorDims) { args.a_ld = a_ld; + for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; + for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc; + for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset; + for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc; + for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset; + for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; + for (auto &beta: tester.kBetaValues) { args.beta = beta; + args.a_size = TestXgemv::GetSizeA(args); + args.x_size = TestXgemv::GetSizeX(args); + args.y_size = TestXgemv::GetSizeY(args); + if (args.a_size<1 || args.x_size<1 || args.y_size<1) { continue; } + regular_test_vector.push_back(args); + } + } + } + } + } + } + } + } + } + } + + // Creates 
the arguments vector for the invalid-buffer tests + auto invalid_test_vector = std::vector>{}; + args.m = args.n = tester.kBufferSize; + args.a_ld = tester.kBufferSize; + args.x_inc = args.y_inc = 1; + args.a_offset = args.x_offset = args.y_offset = 0; + for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; + for (auto &x_size: tester.kVecSizes) { args.x_size = x_size; + for (auto &y_size: tester.kVecSizes) { args.y_size = y_size; + invalid_test_vector.push_back(args); + } + } + } + + // Runs the tests + const auto case_name = ToString(layout)+" "+ToString(a_transpose); + tester.TestRegular(regular_test_vector, case_name); + tester.TestInvalid(invalid_test_vector, case_name); + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTest(argc, argv, false, "SGEMV"); + clblast::RunTest(argc, argv, true, "DGEMV"); + clblast::RunTest(argc, argv, true, "CGEMV"); + clblast::RunTest(argc, argv, true, "ZGEMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xgemm.cc b/test/correctness/routines/level3/xgemm.cc new file mode 100644 index 00000000..c1ce8fe2 --- /dev/null +++ b/test/correctness/routines/level3/xgemm.cc @@ -0,0 +1,102 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xgemm routine. 
+// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level3/xgemm.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester +template +void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates a tester + TestBlas tester{argc, argv, silent, name, TestXgemm::GetOptions(), + TestXgemm::RunRoutine, TestXgemm::RunReference, + TestXgemm::DownloadResult, TestXgemm::GetResultIndex, + TestXgemm::ResultID1, TestXgemm::ResultID2}; + + // This variable holds the arguments relevant for this routine + auto args = Arguments{}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { args.layout = layout; + for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose; + for (auto &b_transpose: tester.kTransposes) { args.b_transpose = b_transpose; + + // Creates the arguments vector for the regular tests + auto regular_test_vector = std::vector>{}; + for (auto &m: tester.kMatrixDims) { args.m = m; + for (auto &n: tester.kMatrixDims) { args.n = n; + for (auto &k: tester.kMatrixDims) { args.k = k; + for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; + for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; + for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld; + for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset; + for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; + for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; + for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; + for (auto &beta: tester.kBetaValues) { args.beta = beta; + args.a_size = TestXgemm::GetSizeA(args); + args.b_size = TestXgemm::GetSizeB(args); + args.c_size = TestXgemm::GetSizeC(args); + if (args.a_size<1 || args.b_size<1 || args.c_size<1) { 
continue; } + regular_test_vector.push_back(args); + } + } + } + } + } + } + } + } + } + } + } + + // Creates the arguments vector for the invalid-buffer tests + auto invalid_test_vector = std::vector>{}; + args.m = args.n = args.k = tester.kBufferSize; + args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize; + args.a_offset = args.b_offset = args.c_offset = 0; + for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; + for (auto &b_size: tester.kMatSizes) { args.b_size = b_size; + for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; + invalid_test_vector.push_back(args); + } + } + } + + // Runs the tests + const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose); + tester.TestRegular(regular_test_vector, case_name); + tester.TestInvalid(invalid_test_vector, case_name); + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTest(argc, argv, false, "SGEMM"); + clblast::RunTest(argc, argv, true, "DGEMM"); + clblast::RunTest(argc, argv, true, "CGEMM"); + clblast::RunTest(argc, argv, true, "ZGEMM"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xhemm.cc b/test/correctness/routines/level3/xhemm.cc new file mode 100644 index 00000000..4d66a57f --- /dev/null +++ b/test/correctness/routines/level3/xhemm.cc @@ -0,0 +1,98 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xhemm routine. +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level3/xhemm.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester +template +void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates a tester + TestBlas tester{argc, argv, silent, name, TestXhemm::GetOptions(), + TestXhemm::RunRoutine, TestXhemm::RunReference, + TestXhemm::DownloadResult, TestXhemm::GetResultIndex, + TestXhemm::ResultID1, TestXhemm::ResultID2}; + + // This variable holds the arguments relevant for this routine + auto args = Arguments{}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { args.layout = layout; + for (auto &side: tester.kSides) { args.side = side; + for (auto &triangle: tester.kTriangles) { args.triangle = triangle; + + // Creates the arguments vector for the regular tests + auto regular_test_vector = std::vector>{}; + for (auto &m: tester.kMatrixDims) { args.m = m; + for (auto &n: tester.kMatrixDims) { args.n = n; + for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; + for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; + for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld; + for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset; + for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; + for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; + for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; + for (auto &beta: tester.kBetaValues) { args.beta = beta; + args.a_size = TestXhemm::GetSizeA(args); + args.b_size = TestXhemm::GetSizeB(args); + args.c_size = TestXhemm::GetSizeC(args); + if (args.a_size<1 || args.b_size<1 || 
args.c_size<1) { continue; } + regular_test_vector.push_back(args); + } + } + } + } + } + } + } + } + } + } + + // Creates the arguments vector for the invalid-buffer tests + auto invalid_test_vector = std::vector>{}; + args.m = args.n = tester.kBufferSize; + args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize; + args.a_offset = args.b_offset = args.c_offset = 0; + for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; + for (auto &b_size: tester.kMatSizes) { args.b_size = b_size; + for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; + invalid_test_vector.push_back(args); + } + } + } + + // Runs the tests + const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle); + tester.TestRegular(regular_test_vector, case_name); + tester.TestInvalid(invalid_test_vector, case_name); + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTest(argc, argv, true, "CHEMM"); + clblast::RunTest(argc, argv, true, "ZHEMM"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xher2k.cc b/test/correctness/routines/level3/xher2k.cc new file mode 100644 index 00000000..ba5260fb --- /dev/null +++ b/test/correctness/routines/level3/xher2k.cc @@ -0,0 +1,100 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xher2k routine. 
+// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level3/xher2k.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester +template +void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates a tester + TestBlas tester{argc, argv, silent, name, TestXher2k::GetOptions(), + TestXher2k::RunRoutine, TestXher2k::RunReference, + TestXher2k::DownloadResult, TestXher2k::GetResultIndex, + TestXher2k::ResultID1, TestXher2k::ResultID2}; + + // This variable holds the arguments relevant for this routine + auto args = Arguments{}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { args.layout = layout; + for (auto &triangle: tester.kTriangles) { args.triangle = triangle; + for (auto &ab_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a + args.a_transpose = ab_transpose; // valid BLAS option + args.b_transpose = ab_transpose; + + // Creates the arguments vector for the regular tests + auto regular_test_vector = std::vector>{}; + for (auto &n: tester.kMatrixDims) { args.n = n; + for (auto &k: tester.kMatrixDims) { args.k = k; + for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; + for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; + for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld; + for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset; + for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; + for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; + for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; + for (auto &beta: tester.kBetaValues) { args.beta = beta; + args.a_size = TestXher2k::GetSizeA(args); + args.b_size = TestXher2k::GetSizeB(args); + args.c_size = TestXher2k::GetSizeC(args); 
+ if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; } + regular_test_vector.push_back(args); + } + } + } + } + } + } + } + } + } + } + + // Creates the arguments vector for the invalid-buffer tests + auto invalid_test_vector = std::vector>{}; + args.n = args.k = tester.kBufferSize; + args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize; + args.a_offset = args.b_offset = args.c_offset = 0; + for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; + for (auto &b_size: tester.kMatSizes) { args.b_size = b_size; + for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; + invalid_test_vector.push_back(args); + } + } + } + + // Runs the tests + const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose); + tester.TestRegular(regular_test_vector, case_name); + tester.TestInvalid(invalid_test_vector, case_name); + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTest(argc, argv, false, "CHER2K"); + clblast::RunTest(argc, argv, true, "ZHER2K"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xherk.cc b/test/correctness/routines/level3/xherk.cc new file mode 100644 index 00000000..7a4a7278 --- /dev/null +++ b/test/correctness/routines/level3/xherk.cc @@ -0,0 +1,92 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xherk routine. 
+// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level3/xherk.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester +template +void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates a tester + TestBlas tester{argc, argv, silent, name, TestXherk::GetOptions(), + TestXherk::RunRoutine, TestXherk::RunReference, + TestXherk::DownloadResult, TestXherk::GetResultIndex, + TestXherk::ResultID1, TestXherk::ResultID2}; + + // This variable holds the arguments relevant for this routine + auto args = Arguments{}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { args.layout = layout; + for (auto &triangle: tester.kTriangles) { args.triangle = triangle; + for (auto &a_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a + args.a_transpose = a_transpose; // valid BLAS option + + // Creates the arguments vector for the regular tests + auto regular_test_vector = std::vector>{}; + for (auto &n: tester.kMatrixDims) { args.n = n; + for (auto &k: tester.kMatrixDims) { args.k = k; + for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; + for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; + for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; + for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; + for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; + for (auto &beta: tester.kBetaValues) { args.beta = beta; + args.a_size = TestXherk::GetSizeA(args); + args.c_size = TestXherk::GetSizeC(args); + if (args.a_size<1 || args.c_size<1) { continue; } + regular_test_vector.push_back(args); + } + } + } + } + } + } + } + } + + // Creates the arguments vector for the invalid-buffer tests + auto invalid_test_vector = 
std::vector>{}; + args.n = args.k = tester.kBufferSize; + args.a_ld = args.c_ld = tester.kBufferSize; + args.a_offset = args.c_offset = 0; + for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; + for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; + invalid_test_vector.push_back(args); + } + } + + // Runs the tests + const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose); + tester.TestRegular(regular_test_vector, case_name); + tester.TestInvalid(invalid_test_vector, case_name); + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTest(argc, argv, false, "CHERK"); + clblast::RunTest(argc, argv, true, "ZHERK"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xsymm.cc b/test/correctness/routines/level3/xsymm.cc new file mode 100644 index 00000000..851efff2 --- /dev/null +++ b/test/correctness/routines/level3/xsymm.cc @@ -0,0 +1,100 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xsymm routine. 
+// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level3/xsymm.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester +template +void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates a tester + TestBlas tester{argc, argv, silent, name, TestXsymm::GetOptions(), + TestXsymm::RunRoutine, TestXsymm::RunReference, + TestXsymm::DownloadResult, TestXsymm::GetResultIndex, + TestXsymm::ResultID1, TestXsymm::ResultID2}; + + // This variable holds the arguments relevant for this routine + auto args = Arguments{}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { args.layout = layout; + for (auto &side: tester.kSides) { args.side = side; + for (auto &triangle: tester.kTriangles) { args.triangle = triangle; + + // Creates the arguments vector for the regular tests + auto regular_test_vector = std::vector>{}; + for (auto &m: tester.kMatrixDims) { args.m = m; + for (auto &n: tester.kMatrixDims) { args.n = n; + for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; + for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; + for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld; + for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset; + for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; + for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; + for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; + for (auto &beta: tester.kBetaValues) { args.beta = beta; + args.a_size = TestXsymm::GetSizeA(args); + args.b_size = TestXsymm::GetSizeB(args); + args.c_size = TestXsymm::GetSizeC(args); + if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; } + regular_test_vector.push_back(args); + } + } + } + } + } + } + } + } + } + 
} + + // Creates the arguments vector for the invalid-buffer tests + auto invalid_test_vector = std::vector>{}; + args.m = args.n = tester.kBufferSize; + args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize; + args.a_offset = args.b_offset = args.c_offset = 0; + for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; + for (auto &b_size: tester.kMatSizes) { args.b_size = b_size; + for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; + invalid_test_vector.push_back(args); + } + } + } + + // Runs the tests + const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle); + tester.TestRegular(regular_test_vector, case_name); + tester.TestInvalid(invalid_test_vector, case_name); + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTest(argc, argv, false, "SSYMM"); + clblast::RunTest(argc, argv, true, "DSYMM"); + clblast::RunTest(argc, argv, true, "CSYMM"); + clblast::RunTest(argc, argv, true, "ZSYMM"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xsyr2k.cc b/test/correctness/routines/level3/xsyr2k.cc new file mode 100644 index 00000000..61ea59a3 --- /dev/null +++ b/test/correctness/routines/level3/xsyr2k.cc @@ -0,0 +1,102 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xsyr2k routine. 
+// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level3/xsyr2k.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester +template +void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates a tester + TestBlas tester{argc, argv, silent, name, TestXsyr2k::GetOptions(), + TestXsyr2k::RunRoutine, TestXsyr2k::RunReference, + TestXsyr2k::DownloadResult, TestXsyr2k::GetResultIndex, + TestXsyr2k::ResultID1, TestXsyr2k::ResultID2}; + + // This variable holds the arguments relevant for this routine + auto args = Arguments{}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { args.layout = layout; + for (auto &triangle: tester.kTriangles) { args.triangle = triangle; + for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it + args.a_transpose = ab_transpose; // is not supported by clBLAS + args.b_transpose = ab_transpose; + + // Creates the arguments vector for the regular tests + auto regular_test_vector = std::vector>{}; + for (auto &n: tester.kMatrixDims) { args.n = n; + for (auto &k: tester.kMatrixDims) { args.k = k; + for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; + for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; + for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld; + for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset; + for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; + for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; + for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; + for (auto &beta: tester.kBetaValues) { args.beta = beta; + args.a_size = TestXsyr2k::GetSizeA(args); + args.b_size = TestXsyr2k::GetSizeB(args); + args.c_size = 
TestXsyr2k::GetSizeC(args); + if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; } + regular_test_vector.push_back(args); + } + } + } + } + } + } + } + } + } + } + + // Creates the arguments vector for the invalid-buffer tests + auto invalid_test_vector = std::vector>{}; + args.n = args.k = tester.kBufferSize; + args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize; + args.a_offset = args.b_offset = args.c_offset = 0; + for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; + for (auto &b_size: tester.kMatSizes) { args.b_size = b_size; + for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; + invalid_test_vector.push_back(args); + } + } + } + + // Runs the tests + const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose); + tester.TestRegular(regular_test_vector, case_name); + tester.TestInvalid(invalid_test_vector, case_name); + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTest(argc, argv, false, "SSYR2K"); + clblast::RunTest(argc, argv, true, "DSYR2K"); + clblast::RunTest(argc, argv, true, "CSYR2K"); + clblast::RunTest(argc, argv, true, "ZSYR2K"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xsyrk.cc b/test/correctness/routines/level3/xsyrk.cc new file mode 100644 index 00000000..126e201b --- /dev/null +++ b/test/correctness/routines/level3/xsyrk.cc @@ -0,0 +1,94 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xsyrk routine. +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level3/xsyrk.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester +template +void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates a tester + TestBlas tester{argc, argv, silent, name, TestXsyrk::GetOptions(), + TestXsyrk::RunRoutine, TestXsyrk::RunReference, + TestXsyrk::DownloadResult, TestXsyrk::GetResultIndex, + TestXsyrk::ResultID1, TestXsyrk::ResultID2}; + + // This variable holds the arguments relevant for this routine + auto args = Arguments{}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { args.layout = layout; + for (auto &triangle: tester.kTriangles) { args.triangle = triangle; + for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it + args.a_transpose = a_transpose; // is not supported by clBLAS + + // Creates the arguments vector for the regular tests + auto regular_test_vector = std::vector>{}; + for (auto &n: tester.kMatrixDims) { args.n = n; + for (auto &k: tester.kMatrixDims) { args.k = k; + for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; + for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; + for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; + for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; + for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; + for (auto &beta: tester.kBetaValues) { args.beta = beta; + args.a_size = TestXsyrk::GetSizeA(args); + 
args.c_size = TestXsyrk::GetSizeC(args); + if (args.a_size<1 || args.c_size<1) { continue; } + regular_test_vector.push_back(args); + } + } + } + } + } + } + } + } + + // Creates the arguments vector for the invalid-buffer tests + auto invalid_test_vector = std::vector>{}; + args.n = args.k = tester.kBufferSize; + args.a_ld = args.c_ld = tester.kBufferSize; + args.a_offset = args.c_offset = 0; + for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; + for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; + invalid_test_vector.push_back(args); + } + } + + // Runs the tests + const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose); + tester.TestRegular(regular_test_vector, case_name); + tester.TestInvalid(invalid_test_vector, case_name); + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTest(argc, argv, false, "SSYRK"); + clblast::RunTest(argc, argv, true, "DSYRK"); + clblast::RunTest(argc, argv, true, "CSYRK"); + clblast::RunTest(argc, argv, true, "ZSYRK"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xtrmm.cc b/test/correctness/routines/level3/xtrmm.cc new file mode 100644 index 00000000..5f04bb18 --- /dev/null +++ b/test/correctness/routines/level3/xtrmm.cc @@ -0,0 +1,96 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xtrmm routine. +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level3/xtrmm.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester +template +void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates a tester + TestBlas tester{argc, argv, silent, name, TestXtrmm::GetOptions(), + TestXtrmm::RunRoutine, TestXtrmm::RunReference, + TestXtrmm::DownloadResult, TestXtrmm::GetResultIndex, + TestXtrmm::ResultID1, TestXtrmm::ResultID2}; + + // This variable holds the arguments relevant for this routine + auto args = Arguments{}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { args.layout = layout; + for (auto &side: tester.kSides) { args.side = side; + for (auto &triangle: tester.kTriangles) { args.triangle = triangle; + for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose; + for (auto &diagonal: tester.kDiagonals) { args.diagonal = diagonal; + + // Creates the arguments vector for the regular tests + auto regular_test_vector = std::vector>{}; + for (auto &m: tester.kMatrixDims) { args.m = m; + for (auto &n: tester.kMatrixDims) { args.n = n; + for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; + for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; + for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld; + for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset; + for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; + args.a_size = TestXtrmm::GetSizeA(args); + args.b_size = TestXtrmm::GetSizeB(args); + if (args.a_size<1 || args.b_size<1) { continue; } + regular_test_vector.push_back(args); + } + } + } + } + } + } + } + + 
// Creates the arguments vector for the invalid-buffer tests + auto invalid_test_vector = std::vector>{}; + args.m = args.n = tester.kBufferSize; + args.a_ld = args.b_ld = tester.kBufferSize; + args.a_offset = args.b_offset = 0; + for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; + for (auto &b_size: tester.kMatSizes) { args.b_size = b_size; + invalid_test_vector.push_back(args); + } + } + + // Runs the tests + const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle)+" "+ + ToString(a_transpose)+" "+ToString(diagonal); + tester.TestRegular(regular_test_vector, case_name); + tester.TestInvalid(invalid_test_vector, case_name); + } + } + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTest(argc, argv, false, "STRMM"); + clblast::RunTest(argc, argv, true, "DTRMM"); + clblast::RunTest(argc, argv, true, "CTRMM"); + clblast::RunTest(argc, argv, true, "ZTRMM"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/xaxpy.cc b/test/correctness/routines/xaxpy.cc deleted file mode 100644 index cf23ca9f..00000000 --- a/test/correctness/routines/xaxpy.cc +++ /dev/null @@ -1,81 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the tests for the Xaxpy routine. 
-// -// ================================================================================================= - -#include "correctness/testblas.h" -#include "routines/xaxpy.h" - -namespace clblast { -// ================================================================================================= - -// The correctness tester -template -void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { - - // Creates a tester - TestBlas tester{argc, argv, silent, name, TestXaxpy::GetOptions(), - TestXaxpy::RunRoutine, TestXaxpy::RunReference, - TestXaxpy::DownloadResult, TestXaxpy::GetResultIndex, - TestXaxpy::ResultID1, TestXaxpy::ResultID2}; - - // This variable holds the arguments relevant for this routine - auto args = Arguments{}; - - // Creates the arguments vector for the regular tests - auto regular_test_vector = std::vector>{}; - for (auto &n: tester.kVectorDims) { args.n = n; - for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc; - for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset; - for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc; - for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset; - for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; - args.x_size = TestXaxpy::GetSizeX(args); - args.y_size = TestXaxpy::GetSizeY(args); - if (args.x_size<1 || args.y_size<1) { continue; } - regular_test_vector.push_back(args); - } - } - } - } - } - } - - // Creates the arguments vector for the invalid-buffer tests - auto invalid_test_vector = std::vector>{}; - args.n = tester.kBufferSize; - args.x_inc = args.y_inc = 1; - args.x_offset = args.y_offset = 0; - for (auto &x_size: tester.kVecSizes) { args.x_size = x_size; - for (auto &y_size: tester.kVecSizes) { args.y_size = y_size; - invalid_test_vector.push_back(args); - } - } - - // Runs the tests - const auto case_name = "default"; - tester.TestRegular(regular_test_vector, case_name); - tester.TestInvalid(invalid_test_vector, case_name); -} - -// 
================================================================================================= -} // namespace clblast - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - clblast::RunTest(argc, argv, false, "SAXPY"); - clblast::RunTest(argc, argv, true, "DAXPY"); - clblast::RunTest(argc, argv, true, "CAXPY"); - clblast::RunTest(argc, argv, true, "ZAXPY"); - return 0; -} - -// ================================================================================================= diff --git a/test/correctness/routines/xgemm.cc b/test/correctness/routines/xgemm.cc deleted file mode 100644 index 8a50e1ca..00000000 --- a/test/correctness/routines/xgemm.cc +++ /dev/null @@ -1,102 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the tests for the Xgemm routine. 
-// -// ================================================================================================= - -#include "correctness/testblas.h" -#include "routines/xgemm.h" - -namespace clblast { -// ================================================================================================= - -// The correctness tester -template -void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { - - // Creates a tester - TestBlas tester{argc, argv, silent, name, TestXgemm::GetOptions(), - TestXgemm::RunRoutine, TestXgemm::RunReference, - TestXgemm::DownloadResult, TestXgemm::GetResultIndex, - TestXgemm::ResultID1, TestXgemm::ResultID2}; - - // This variable holds the arguments relevant for this routine - auto args = Arguments{}; - - // Loops over the test-cases from a data-layout point of view - for (auto &layout: tester.kLayouts) { args.layout = layout; - for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose; - for (auto &b_transpose: tester.kTransposes) { args.b_transpose = b_transpose; - - // Creates the arguments vector for the regular tests - auto regular_test_vector = std::vector>{}; - for (auto &m: tester.kMatrixDims) { args.m = m; - for (auto &n: tester.kMatrixDims) { args.n = n; - for (auto &k: tester.kMatrixDims) { args.k = k; - for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; - for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; - for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld; - for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset; - for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; - for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; - for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; - for (auto &beta: tester.kBetaValues) { args.beta = beta; - args.a_size = TestXgemm::GetSizeA(args); - args.b_size = TestXgemm::GetSizeB(args); - args.c_size = TestXgemm::GetSizeC(args); - if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; 
} - regular_test_vector.push_back(args); - } - } - } - } - } - } - } - } - } - } - } - - // Creates the arguments vector for the invalid-buffer tests - auto invalid_test_vector = std::vector>{}; - args.m = args.n = args.k = tester.kBufferSize; - args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize; - args.a_offset = args.b_offset = args.c_offset = 0; - for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; - for (auto &b_size: tester.kMatSizes) { args.b_size = b_size; - for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; - invalid_test_vector.push_back(args); - } - } - } - - // Runs the tests - const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose); - tester.TestRegular(regular_test_vector, case_name); - tester.TestInvalid(invalid_test_vector, case_name); - } - } - } -} - -// ================================================================================================= -} // namespace clblast - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - clblast::RunTest(argc, argv, false, "SGEMM"); - clblast::RunTest(argc, argv, true, "DGEMM"); - clblast::RunTest(argc, argv, true, "CGEMM"); - clblast::RunTest(argc, argv, true, "ZGEMM"); - return 0; -} - -// ================================================================================================= diff --git a/test/correctness/routines/xgemv.cc b/test/correctness/routines/xgemv.cc deleted file mode 100644 index 50ce4699..00000000 --- a/test/correctness/routines/xgemv.cc +++ /dev/null @@ -1,99 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// This file implements the tests for the Xgemv routine. -// -// ================================================================================================= - -#include "correctness/testblas.h" -#include "routines/xgemv.h" - -namespace clblast { -// ================================================================================================= - -// The correctness tester -template -void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { - - // Creates a tester - TestBlas tester{argc, argv, silent, name, TestXgemv::GetOptions(), - TestXgemv::RunRoutine, TestXgemv::RunReference, - TestXgemv::DownloadResult, TestXgemv::GetResultIndex, - TestXgemv::ResultID1, TestXgemv::ResultID2}; - - // This variable holds the arguments relevant for this routine - auto args = Arguments{}; - - // Loops over the test-cases from a data-layout point of view - for (auto &layout: tester.kLayouts) { args.layout = layout; - for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose; - - // Creates the arguments vector for the regular tests - auto regular_test_vector = std::vector>{}; - for (auto &m: tester.kMatrixVectorDims) { args.m = m; - for (auto &n: tester.kMatrixVectorDims) { args.n = n; - for (auto &a_ld: tester.kMatrixVectorDims) { args.a_ld = a_ld; - for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; - for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc; - for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset; - for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc; - for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset; - for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; - for (auto &beta: tester.kBetaValues) { args.beta = beta; - args.a_size = TestXgemv::GetSizeA(args); - args.x_size = TestXgemv::GetSizeX(args); - args.y_size = TestXgemv::GetSizeY(args); - if (args.a_size<1 || args.x_size<1 || args.y_size<1) { continue; } - 
regular_test_vector.push_back(args); - } - } - } - } - } - } - } - } - } - } - - // Creates the arguments vector for the invalid-buffer tests - auto invalid_test_vector = std::vector>{}; - args.m = args.n = tester.kBufferSize; - args.a_ld = tester.kBufferSize; - args.x_inc = args.y_inc = 1; - args.a_offset = args.x_offset = args.y_offset = 0; - for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; - for (auto &x_size: tester.kVecSizes) { args.x_size = x_size; - for (auto &y_size: tester.kVecSizes) { args.y_size = y_size; - invalid_test_vector.push_back(args); - } - } - } - - // Runs the tests - const auto case_name = ToString(layout)+" "+ToString(a_transpose); - tester.TestRegular(regular_test_vector, case_name); - tester.TestInvalid(invalid_test_vector, case_name); - } - } -} - -// ================================================================================================= -} // namespace clblast - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - clblast::RunTest(argc, argv, false, "SGEMV"); - clblast::RunTest(argc, argv, true, "DGEMV"); - clblast::RunTest(argc, argv, true, "CGEMV"); - clblast::RunTest(argc, argv, true, "ZGEMV"); - return 0; -} - -// ================================================================================================= diff --git a/test/correctness/routines/xhemm.cc b/test/correctness/routines/xhemm.cc deleted file mode 100644 index e8c82f65..00000000 --- a/test/correctness/routines/xhemm.cc +++ /dev/null @@ -1,98 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the tests for the Xhemm routine. 
-// -// ================================================================================================= - -#include "correctness/testblas.h" -#include "routines/xhemm.h" - -namespace clblast { -// ================================================================================================= - -// The correctness tester -template -void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { - - // Creates a tester - TestBlas tester{argc, argv, silent, name, TestXhemm::GetOptions(), - TestXhemm::RunRoutine, TestXhemm::RunReference, - TestXhemm::DownloadResult, TestXhemm::GetResultIndex, - TestXhemm::ResultID1, TestXhemm::ResultID2}; - - // This variable holds the arguments relevant for this routine - auto args = Arguments{}; - - // Loops over the test-cases from a data-layout point of view - for (auto &layout: tester.kLayouts) { args.layout = layout; - for (auto &side: tester.kSides) { args.side = side; - for (auto &triangle: tester.kTriangles) { args.triangle = triangle; - - // Creates the arguments vector for the regular tests - auto regular_test_vector = std::vector>{}; - for (auto &m: tester.kMatrixDims) { args.m = m; - for (auto &n: tester.kMatrixDims) { args.n = n; - for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; - for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; - for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld; - for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset; - for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; - for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; - for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; - for (auto &beta: tester.kBetaValues) { args.beta = beta; - args.a_size = TestXhemm::GetSizeA(args); - args.b_size = TestXhemm::GetSizeB(args); - args.c_size = TestXhemm::GetSizeC(args); - if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; } - regular_test_vector.push_back(args); - } - } - } - } - } - } - } - } - } - } - - 
// Creates the arguments vector for the invalid-buffer tests - auto invalid_test_vector = std::vector>{}; - args.m = args.n = tester.kBufferSize; - args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize; - args.a_offset = args.b_offset = args.c_offset = 0; - for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; - for (auto &b_size: tester.kMatSizes) { args.b_size = b_size; - for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; - invalid_test_vector.push_back(args); - } - } - } - - // Runs the tests - const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle); - tester.TestRegular(regular_test_vector, case_name); - tester.TestInvalid(invalid_test_vector, case_name); - } - } - } -} - -// ================================================================================================= -} // namespace clblast - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - clblast::RunTest(argc, argv, true, "CHEMM"); - clblast::RunTest(argc, argv, true, "ZHEMM"); - return 0; -} - -// ================================================================================================= diff --git a/test/correctness/routines/xher2k.cc b/test/correctness/routines/xher2k.cc deleted file mode 100644 index 7c0e5a92..00000000 --- a/test/correctness/routines/xher2k.cc +++ /dev/null @@ -1,100 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the tests for the Xher2k routine. 
-// -// ================================================================================================= - -#include "correctness/testblas.h" -#include "routines/xher2k.h" - -namespace clblast { -// ================================================================================================= - -// The correctness tester -template -void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { - - // Creates a tester - TestBlas tester{argc, argv, silent, name, TestXher2k::GetOptions(), - TestXher2k::RunRoutine, TestXher2k::RunReference, - TestXher2k::DownloadResult, TestXher2k::GetResultIndex, - TestXher2k::ResultID1, TestXher2k::ResultID2}; - - // This variable holds the arguments relevant for this routine - auto args = Arguments{}; - - // Loops over the test-cases from a data-layout point of view - for (auto &layout: tester.kLayouts) { args.layout = layout; - for (auto &triangle: tester.kTriangles) { args.triangle = triangle; - for (auto &ab_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a - args.a_transpose = ab_transpose; // valid BLAS option - args.b_transpose = ab_transpose; - - // Creates the arguments vector for the regular tests - auto regular_test_vector = std::vector>{}; - for (auto &n: tester.kMatrixDims) { args.n = n; - for (auto &k: tester.kMatrixDims) { args.k = k; - for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; - for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; - for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld; - for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset; - for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; - for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; - for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; - for (auto &beta: tester.kBetaValues) { args.beta = beta; - args.a_size = TestXher2k::GetSizeA(args); - args.b_size = TestXher2k::GetSizeB(args); - args.c_size = TestXher2k::GetSizeC(args); - if 
(args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; } - regular_test_vector.push_back(args); - } - } - } - } - } - } - } - } - } - } - - // Creates the arguments vector for the invalid-buffer tests - auto invalid_test_vector = std::vector>{}; - args.n = args.k = tester.kBufferSize; - args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize; - args.a_offset = args.b_offset = args.c_offset = 0; - for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; - for (auto &b_size: tester.kMatSizes) { args.b_size = b_size; - for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; - invalid_test_vector.push_back(args); - } - } - } - - // Runs the tests - const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose); - tester.TestRegular(regular_test_vector, case_name); - tester.TestInvalid(invalid_test_vector, case_name); - } - } - } -} - -// ================================================================================================= -} // namespace clblast - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - clblast::RunTest(argc, argv, false, "CHER2K"); - clblast::RunTest(argc, argv, true, "ZHER2K"); - return 0; -} - -// ================================================================================================= diff --git a/test/correctness/routines/xherk.cc b/test/correctness/routines/xherk.cc deleted file mode 100644 index dc5c6caf..00000000 --- a/test/correctness/routines/xherk.cc +++ /dev/null @@ -1,92 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the tests for the Xherk routine. 
-// -// ================================================================================================= - -#include "correctness/testblas.h" -#include "routines/xherk.h" - -namespace clblast { -// ================================================================================================= - -// The correctness tester -template -void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { - - // Creates a tester - TestBlas tester{argc, argv, silent, name, TestXherk::GetOptions(), - TestXherk::RunRoutine, TestXherk::RunReference, - TestXherk::DownloadResult, TestXherk::GetResultIndex, - TestXherk::ResultID1, TestXherk::ResultID2}; - - // This variable holds the arguments relevant for this routine - auto args = Arguments{}; - - // Loops over the test-cases from a data-layout point of view - for (auto &layout: tester.kLayouts) { args.layout = layout; - for (auto &triangle: tester.kTriangles) { args.triangle = triangle; - for (auto &a_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a - args.a_transpose = a_transpose; // valid BLAS option - - // Creates the arguments vector for the regular tests - auto regular_test_vector = std::vector>{}; - for (auto &n: tester.kMatrixDims) { args.n = n; - for (auto &k: tester.kMatrixDims) { args.k = k; - for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; - for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; - for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; - for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; - for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; - for (auto &beta: tester.kBetaValues) { args.beta = beta; - args.a_size = TestXherk::GetSizeA(args); - args.c_size = TestXherk::GetSizeC(args); - if (args.a_size<1 || args.c_size<1) { continue; } - regular_test_vector.push_back(args); - } - } - } - } - } - } - } - } - - // Creates the arguments vector for the invalid-buffer tests - auto invalid_test_vector = 
std::vector>{}; - args.n = args.k = tester.kBufferSize; - args.a_ld = args.c_ld = tester.kBufferSize; - args.a_offset = args.c_offset = 0; - for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; - for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; - invalid_test_vector.push_back(args); - } - } - - // Runs the tests - const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose); - tester.TestRegular(regular_test_vector, case_name); - tester.TestInvalid(invalid_test_vector, case_name); - } - } - } -} - -// ================================================================================================= -} // namespace clblast - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - clblast::RunTest(argc, argv, false, "CHERK"); - clblast::RunTest(argc, argv, true, "ZHERK"); - return 0; -} - -// ================================================================================================= diff --git a/test/correctness/routines/xsymm.cc b/test/correctness/routines/xsymm.cc deleted file mode 100644 index a919a056..00000000 --- a/test/correctness/routines/xsymm.cc +++ /dev/null @@ -1,100 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the tests for the Xsymm routine. 
-// -// ================================================================================================= - -#include "correctness/testblas.h" -#include "routines/xsymm.h" - -namespace clblast { -// ================================================================================================= - -// The correctness tester -template -void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { - - // Creates a tester - TestBlas tester{argc, argv, silent, name, TestXsymm::GetOptions(), - TestXsymm::RunRoutine, TestXsymm::RunReference, - TestXsymm::DownloadResult, TestXsymm::GetResultIndex, - TestXsymm::ResultID1, TestXsymm::ResultID2}; - - // This variable holds the arguments relevant for this routine - auto args = Arguments{}; - - // Loops over the test-cases from a data-layout point of view - for (auto &layout: tester.kLayouts) { args.layout = layout; - for (auto &side: tester.kSides) { args.side = side; - for (auto &triangle: tester.kTriangles) { args.triangle = triangle; - - // Creates the arguments vector for the regular tests - auto regular_test_vector = std::vector>{}; - for (auto &m: tester.kMatrixDims) { args.m = m; - for (auto &n: tester.kMatrixDims) { args.n = n; - for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; - for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; - for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld; - for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset; - for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; - for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; - for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; - for (auto &beta: tester.kBetaValues) { args.beta = beta; - args.a_size = TestXsymm::GetSizeA(args); - args.b_size = TestXsymm::GetSizeB(args); - args.c_size = TestXsymm::GetSizeC(args); - if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; } - regular_test_vector.push_back(args); - } - } - } - } - } - } - } - } - } - } - - 
// Creates the arguments vector for the invalid-buffer tests - auto invalid_test_vector = std::vector>{}; - args.m = args.n = tester.kBufferSize; - args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize; - args.a_offset = args.b_offset = args.c_offset = 0; - for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; - for (auto &b_size: tester.kMatSizes) { args.b_size = b_size; - for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; - invalid_test_vector.push_back(args); - } - } - } - - // Runs the tests - const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle); - tester.TestRegular(regular_test_vector, case_name); - tester.TestInvalid(invalid_test_vector, case_name); - } - } - } -} - -// ================================================================================================= -} // namespace clblast - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - clblast::RunTest(argc, argv, false, "SSYMM"); - clblast::RunTest(argc, argv, true, "DSYMM"); - clblast::RunTest(argc, argv, true, "CSYMM"); - clblast::RunTest(argc, argv, true, "ZSYMM"); - return 0; -} - -// ================================================================================================= diff --git a/test/correctness/routines/xsyr2k.cc b/test/correctness/routines/xsyr2k.cc deleted file mode 100644 index 736aa4e5..00000000 --- a/test/correctness/routines/xsyr2k.cc +++ /dev/null @@ -1,102 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the tests for the Xsyr2k routine. 
-// -// ================================================================================================= - -#include "correctness/testblas.h" -#include "routines/xsyr2k.h" - -namespace clblast { -// ================================================================================================= - -// The correctness tester -template -void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { - - // Creates a tester - TestBlas tester{argc, argv, silent, name, TestXsyr2k::GetOptions(), - TestXsyr2k::RunRoutine, TestXsyr2k::RunReference, - TestXsyr2k::DownloadResult, TestXsyr2k::GetResultIndex, - TestXsyr2k::ResultID1, TestXsyr2k::ResultID2}; - - // This variable holds the arguments relevant for this routine - auto args = Arguments{}; - - // Loops over the test-cases from a data-layout point of view - for (auto &layout: tester.kLayouts) { args.layout = layout; - for (auto &triangle: tester.kTriangles) { args.triangle = triangle; - for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it - args.a_transpose = ab_transpose; // is not supported by clBLAS - args.b_transpose = ab_transpose; - - // Creates the arguments vector for the regular tests - auto regular_test_vector = std::vector>{}; - for (auto &n: tester.kMatrixDims) { args.n = n; - for (auto &k: tester.kMatrixDims) { args.k = k; - for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; - for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; - for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld; - for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset; - for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; - for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; - for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; - for (auto &beta: tester.kBetaValues) { args.beta = beta; - args.a_size = TestXsyr2k::GetSizeA(args); - args.b_size = TestXsyr2k::GetSizeB(args); - args.c_size = TestXsyr2k::GetSizeC(args); - 
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; } - regular_test_vector.push_back(args); - } - } - } - } - } - } - } - } - } - } - - // Creates the arguments vector for the invalid-buffer tests - auto invalid_test_vector = std::vector>{}; - args.n = args.k = tester.kBufferSize; - args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize; - args.a_offset = args.b_offset = args.c_offset = 0; - for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; - for (auto &b_size: tester.kMatSizes) { args.b_size = b_size; - for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; - invalid_test_vector.push_back(args); - } - } - } - - // Runs the tests - const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose); - tester.TestRegular(regular_test_vector, case_name); - tester.TestInvalid(invalid_test_vector, case_name); - } - } - } -} - -// ================================================================================================= -} // namespace clblast - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - clblast::RunTest(argc, argv, false, "SSYR2K"); - clblast::RunTest(argc, argv, true, "DSYR2K"); - clblast::RunTest(argc, argv, true, "CSYR2K"); - clblast::RunTest(argc, argv, true, "ZSYR2K"); - return 0; -} - -// ================================================================================================= diff --git a/test/correctness/routines/xsyrk.cc b/test/correctness/routines/xsyrk.cc deleted file mode 100644 index a62a0ebf..00000000 --- a/test/correctness/routines/xsyrk.cc +++ /dev/null @@ -1,94 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// This file implements the tests for the Xsyrk routine. -// -// ================================================================================================= - -#include "correctness/testblas.h" -#include "routines/xsyrk.h" - -namespace clblast { -// ================================================================================================= - -// The correctness tester -template -void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { - - // Creates a tester - TestBlas tester{argc, argv, silent, name, TestXsyrk::GetOptions(), - TestXsyrk::RunRoutine, TestXsyrk::RunReference, - TestXsyrk::DownloadResult, TestXsyrk::GetResultIndex, - TestXsyrk::ResultID1, TestXsyrk::ResultID2}; - - // This variable holds the arguments relevant for this routine - auto args = Arguments{}; - - // Loops over the test-cases from a data-layout point of view - for (auto &layout: tester.kLayouts) { args.layout = layout; - for (auto &triangle: tester.kTriangles) { args.triangle = triangle; - for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it - args.a_transpose = a_transpose; // is not supported by clBLAS - - // Creates the arguments vector for the regular tests - auto regular_test_vector = std::vector>{}; - for (auto &n: tester.kMatrixDims) { args.n = n; - for (auto &k: tester.kMatrixDims) { args.k = k; - for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; - for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; - for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld; - for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset; - for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; - for (auto &beta: tester.kBetaValues) { args.beta = beta; - args.a_size = TestXsyrk::GetSizeA(args); - args.c_size = TestXsyrk::GetSizeC(args); - if (args.a_size<1 || args.c_size<1) { continue; } - regular_test_vector.push_back(args); - } - } - } - } - } - } - } 
- } - - // Creates the arguments vector for the invalid-buffer tests - auto invalid_test_vector = std::vector>{}; - args.n = args.k = tester.kBufferSize; - args.a_ld = args.c_ld = tester.kBufferSize; - args.a_offset = args.c_offset = 0; - for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; - for (auto &c_size: tester.kMatSizes) { args.c_size = c_size; - invalid_test_vector.push_back(args); - } - } - - // Runs the tests - const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose); - tester.TestRegular(regular_test_vector, case_name); - tester.TestInvalid(invalid_test_vector, case_name); - } - } - } -} - -// ================================================================================================= -} // namespace clblast - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - clblast::RunTest(argc, argv, false, "SSYRK"); - clblast::RunTest(argc, argv, true, "DSYRK"); - clblast::RunTest(argc, argv, true, "CSYRK"); - clblast::RunTest(argc, argv, true, "ZSYRK"); - return 0; -} - -// ================================================================================================= diff --git a/test/correctness/routines/xtrmm.cc b/test/correctness/routines/xtrmm.cc deleted file mode 100644 index 0bb6294c..00000000 --- a/test/correctness/routines/xtrmm.cc +++ /dev/null @@ -1,96 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the tests for the Xtrmm routine. 
-// -// ================================================================================================= - -#include "correctness/testblas.h" -#include "routines/xtrmm.h" - -namespace clblast { -// ================================================================================================= - -// The correctness tester -template -void RunTest(int argc, char *argv[], const bool silent, const std::string &name) { - - // Creates a tester - TestBlas tester{argc, argv, silent, name, TestXtrmm::GetOptions(), - TestXtrmm::RunRoutine, TestXtrmm::RunReference, - TestXtrmm::DownloadResult, TestXtrmm::GetResultIndex, - TestXtrmm::ResultID1, TestXtrmm::ResultID2}; - - // This variable holds the arguments relevant for this routine - auto args = Arguments{}; - - // Loops over the test-cases from a data-layout point of view - for (auto &layout: tester.kLayouts) { args.layout = layout; - for (auto &side: tester.kSides) { args.side = side; - for (auto &triangle: tester.kTriangles) { args.triangle = triangle; - for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose; - for (auto &diagonal: tester.kDiagonals) { args.diagonal = diagonal; - - // Creates the arguments vector for the regular tests - auto regular_test_vector = std::vector>{}; - for (auto &m: tester.kMatrixDims) { args.m = m; - for (auto &n: tester.kMatrixDims) { args.n = n; - for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld; - for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset; - for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld; - for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset; - for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha; - args.a_size = TestXtrmm::GetSizeA(args); - args.b_size = TestXtrmm::GetSizeB(args); - if (args.a_size<1 || args.b_size<1) { continue; } - regular_test_vector.push_back(args); - } - } - } - } - } - } - } - - // Creates the arguments vector for the invalid-buffer tests - auto invalid_test_vector = std::vector>{}; - 
args.m = args.n = tester.kBufferSize; - args.a_ld = args.b_ld = tester.kBufferSize; - args.a_offset = args.b_offset = 0; - for (auto &a_size: tester.kMatSizes) { args.a_size = a_size; - for (auto &b_size: tester.kMatSizes) { args.b_size = b_size; - invalid_test_vector.push_back(args); - } - } - - // Runs the tests - const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle)+" "+ - ToString(a_transpose)+" "+ToString(diagonal); - tester.TestRegular(regular_test_vector, case_name); - tester.TestInvalid(invalid_test_vector, case_name); - } - } - } - } - } -} - -// ================================================================================================= -} // namespace clblast - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - clblast::RunTest(argc, argv, false, "STRMM"); - clblast::RunTest(argc, argv, true, "DTRMM"); - clblast::RunTest(argc, argv, true, "CTRMM"); - clblast::RunTest(argc, argv, true, "ZTRMM"); - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xaxpy.cc b/test/performance/routines/level1/xaxpy.cc new file mode 100644 index 00000000..fe90c697 --- /dev/null +++ b/test/performance/routines/level1/xaxpy.cc @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xaxpy command-line interface performance tester. 
+// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xaxpy.h" + +// ================================================================================================= + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xgemv.cc b/test/performance/routines/level2/xgemv.cc new file mode 100644 index 00000000..376c6c33 --- /dev/null +++ b/test/performance/routines/level2/xgemv.cc @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgemv command-line interface performance tester. 
+// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xgemv.h" + +// ================================================================================================= + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xgemm.cc b/test/performance/routines/level3/xgemm.cc new file mode 100644 index 00000000..c45c238f --- /dev/null +++ b/test/performance/routines/level3/xgemm.cc @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgemm command-line interface performance tester. 
+// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level3/xgemm.h" + +// ================================================================================================= + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xhemm.cc b/test/performance/routines/level3/xhemm.cc new file mode 100644 index 00000000..d215653b --- /dev/null +++ b/test/performance/routines/level3/xhemm.cc @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhemm command-line interface performance tester. 
+// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level3/xhemm.h" + +// ================================================================================================= + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xher2k.cc b/test/performance/routines/level3/xher2k.cc new file mode 100644 index 00000000..2e1f248a --- /dev/null +++ b/test/performance/routines/level3/xher2k.cc @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xher2k command-line interface performance tester. 
+// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level3/xher2k.h" + +// ================================================================================================= + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xherk.cc b/test/performance/routines/level3/xherk.cc new file mode 100644 index 00000000..4386f78c --- /dev/null +++ b/test/performance/routines/level3/xherk.cc @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xherk command-line interface performance tester. 
+// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level3/xherk.h" + +// ================================================================================================= + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xsymm.cc b/test/performance/routines/level3/xsymm.cc new file mode 100644 index 00000000..bd014cee --- /dev/null +++ b/test/performance/routines/level3/xsymm.cc @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsymm command-line interface performance tester. 
+// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level3/xsymm.h" + +// ================================================================================================= + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xsyr2k.cc b/test/performance/routines/level3/xsyr2k.cc new file mode 100644 index 00000000..1261be88 --- /dev/null +++ b/test/performance/routines/level3/xsyr2k.cc @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr2k command-line interface performance tester. 
+// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level3/xsyr2k.h" + +// ================================================================================================= + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xsyrk.cc b/test/performance/routines/level3/xsyrk.cc new file mode 100644 index 00000000..5799130f --- /dev/null +++ b/test/performance/routines/level3/xsyrk.cc @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyrk command-line interface performance tester. 
+// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level3/xsyrk.h" + +// ================================================================================================= + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xtrmm.cc b/test/performance/routines/level3/xtrmm.cc new file mode 100644 index 00000000..c30866e9 --- /dev/null +++ b/test/performance/routines/level3/xtrmm.cc @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrmm command-line interface performance tester. 
+// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level3/xtrmm.h" + +// ================================================================================================= + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: + throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/xaxpy.cc b/test/performance/routines/xaxpy.cc deleted file mode 100644 index 6a2b96c1..00000000 --- a/test/performance/routines/xaxpy.cc +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xaxpy command-line interface performance tester. 
-// -// ================================================================================================= - -#include "performance/client.h" -#include "routines/xaxpy.h" - -// ================================================================================================= - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/xgemm.cc b/test/performance/routines/xgemm.cc deleted file mode 100644 index 9a02e595..00000000 --- a/test/performance/routines/xgemm.cc +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xgemm command-line interface performance tester. 
-// -// ================================================================================================= - -#include "performance/client.h" -#include "routines/xgemm.h" - -// ================================================================================================= - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/xgemv.cc b/test/performance/routines/xgemv.cc deleted file mode 100644 index 6f69ef21..00000000 --- a/test/performance/routines/xgemv.cc +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xgemv command-line interface performance tester. 
-// -// ================================================================================================= - -#include "performance/client.h" -#include "routines/xgemv.h" - -// ================================================================================================= - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/xhemm.cc b/test/performance/routines/xhemm.cc deleted file mode 100644 index 34798d8d..00000000 --- a/test/performance/routines/xhemm.cc +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xhemm command-line interface performance tester. 
-// -// ================================================================================================= - -#include "performance/client.h" -#include "routines/xhemm.h" - -// ================================================================================================= - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/xher2k.cc b/test/performance/routines/xher2k.cc deleted file mode 100644 index 1b505737..00000000 --- a/test/performance/routines/xher2k.cc +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xher2k command-line interface performance tester. 
-// -// ================================================================================================= - -#include "performance/client.h" -#include "routines/xher2k.h" - -// ================================================================================================= - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/xherk.cc b/test/performance/routines/xherk.cc deleted file mode 100644 index ce18152e..00000000 --- a/test/performance/routines/xherk.cc +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xherk command-line interface performance tester. 
-// -// ================================================================================================= - -#include "performance/client.h" -#include "routines/xherk.h" - -// ================================================================================================= - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/xsymm.cc b/test/performance/routines/xsymm.cc deleted file mode 100644 index 8738ceda..00000000 --- a/test/performance/routines/xsymm.cc +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsymm command-line interface performance tester. 
-// -// ================================================================================================= - -#include "performance/client.h" -#include "routines/xsymm.h" - -// ================================================================================================= - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/xsyr2k.cc b/test/performance/routines/xsyr2k.cc deleted file mode 100644 index e4c76229..00000000 --- a/test/performance/routines/xsyr2k.cc +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsyr2k command-line interface performance tester. 
-// -// ================================================================================================= - -#include "performance/client.h" -#include "routines/xsyr2k.h" - -// ================================================================================================= - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/xsyrk.cc b/test/performance/routines/xsyrk.cc deleted file mode 100644 index 53fecb69..00000000 --- a/test/performance/routines/xsyrk.cc +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsyrk command-line interface performance tester. 
-// -// ================================================================================================= - -#include "performance/client.h" -#include "routines/xsyrk.h" - -// ================================================================================================= - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/xtrmm.cc b/test/performance/routines/xtrmm.cc deleted file mode 100644 index 2ab9ce77..00000000 --- a/test/performance/routines/xtrmm.cc +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xtrmm command-line interface performance tester. 
-// -// ================================================================================================= - -#include "performance/client.h" -#include "routines/xtrmm.h" - -// ================================================================================================= - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/routines/level1/xaxpy.h b/test/routines/level1/xaxpy.h new file mode 100644 index 00000000..6ce5d7e2 --- /dev/null +++ b/test/routines/level1/xaxpy.h @@ -0,0 +1,113 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xaxpy routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XAXPY_H_ +#define CLBLAST_TEST_ROUTINES_XAXPY_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXaxpy { + public: + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgXInc, kArgYInc, + kArgXOffset, kArgYOffset, + kArgAlpha}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments &args) { + return args.n * args.y_inc + args.y_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Axpy(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, + 
CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXaxpy(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, + CommandQueue &queue) { + std::vector result(args.y_size, static_cast(0)); + buffers.y_vec.ReadBuffer(queue, args.y_size*sizeof(T), result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.n; } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n; + } + static size_t GetBytes(const Arguments &args) { + return (3 * args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XAXPY_H_ +#endif diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.h new file mode 100644 index 00000000..73f7d76e --- /dev/null +++ b/test/routines/level2/xgemv.h @@ -0,0 +1,132 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xgemv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XGEMV_H_ +#define CLBLAST_TEST_ROUTINES_XGEMV_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXgemv { + public: + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgM, kArgN, + kArgLayout, kArgATransp, + kArgALeadDim, kArgXInc, kArgYInc, + kArgAOffset, kArgXOffset, kArgYOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments &args) { + auto a_rotated = (args.layout == Layout::kRowMajor); + auto a_two = (a_rotated) ? args.m : args.n; + return a_two * args.a_ld + args.a_offset; + } + static size_t GetSizeX(const Arguments &args) { + auto a_transposed = (args.a_transpose != Transpose::kNo); + auto n_real = (a_transposed) ? args.m : args.n; + return n_real * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments &args) { + auto a_transposed = (args.a_transpose != Transpose::kNo); + auto m_real = (a_transposed) ? 
args.n : args.m; + return m_real * args.y_inc + args.y_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.n; } + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Gemv(args.layout, args.a_transpose, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgemv(static_cast(args.layout), + static_cast(args.a_transpose), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, + CommandQueue &queue) { + std::vector result(args.y_size, static_cast(0)); + 
buffers.y_vec.ReadBuffer(queue, args.y_size*sizeof(T), result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { + auto a_transposed = (args.a_transpose != Transpose::kNo); + return (a_transposed) ? args.n : args.m; + } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.m * args.n; + } + static size_t GetBytes(const Arguments &args) { + return (args.m*args.n + 2*args.m + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XGEMV_H_ +#endif diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h new file mode 100644 index 00000000..86a304d1 --- /dev/null +++ b/test/routines/level3/xgemm.h @@ -0,0 +1,134 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xgemm routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XGEMM_H_ +#define CLBLAST_TEST_ROUTINES_XGEMM_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXgemm { + public: + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgM, kArgN, kArgK, + kArgLayout, kArgATransp, kArgBTransp, + kArgALeadDim, kArgBLeadDim, kArgCLeadDim, + kArgAOffset, kArgBOffset, kArgCOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments &args) { + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + auto a_two = (a_rotated) ? args.m : args.k; + return a_two * args.a_ld + args.a_offset; + } + static size_t GetSizeB(const Arguments &args) { + auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo); + auto b_two = (b_rotated) ? args.k : args.n; + return b_two * args.b_ld + args.b_offset; + } + static size_t GetSizeC(const Arguments &args) { + auto c_rotated = (args.layout == Layout::kRowMajor); + auto c_two = (c_rotated) ? 
args.m : args.n; + return c_two * args.c_ld + args.c_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.b_size = GetSizeB(args); + args.c_size = GetSizeC(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.k; } + static size_t DefaultLDB(const Arguments &args) { return args.n; } + static size_t DefaultLDC(const Arguments &args) { return args.n; } + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, + args.m, args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgemm(static_cast(args.layout), + static_cast(args.a_transpose), + static_cast(args.b_transpose), + args.m, args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, + CommandQueue &queue) { + std::vector result(args.c_size, 
static_cast(0)); + buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.m; } + static size_t ResultID2(const Arguments &args) { return args.n; } + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { + return (args.layout == Layout::kRowMajor) ? + id1*args.c_ld + id2 + args.c_offset: + id2*args.c_ld + id1 + args.c_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.m * args.n * args.m; + } + static size_t GetBytes(const Arguments &args) { + return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XGEMM_H_ +#endif diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.h new file mode 100644 index 00000000..75878b06 --- /dev/null +++ b/test/routines/level3/xhemm.h @@ -0,0 +1,134 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xhemm routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XHEMM_H_ +#define CLBLAST_TEST_ROUTINES_XHEMM_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXhemm { + public: + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgM, kArgN, + kArgLayout, kArgSide, kArgTriangle, + kArgALeadDim, kArgBLeadDim, kArgCLeadDim, + kArgAOffset, kArgBOffset, kArgCOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments &args) { + size_t k_value = (args.side == Side::kLeft) ? args.m : args.n; + auto a_rotated = (args.layout == Layout::kRowMajor); + auto a_two = (a_rotated) ? args.m : k_value; + return a_two * args.a_ld + args.a_offset; + } + static size_t GetSizeB(const Arguments &args) { + size_t k_value = (args.side == Side::kLeft) ? args.m : args.n; + auto b_rotated = (args.layout == Layout::kRowMajor); + auto b_two = (b_rotated) ? k_value : args.n; + return b_two * args.b_ld + args.b_offset; + } + static size_t GetSizeC(const Arguments &args) { + auto c_rotated = (args.layout == Layout::kRowMajor); + auto c_two = (c_rotated) ? 
args.m : args.n; + return c_two * args.c_ld + args.c_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.b_size = GetSizeB(args); + args.c_size = GetSizeC(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.m; } + static size_t DefaultLDB(const Arguments &args) { return args.n; } + static size_t DefaultLDC(const Arguments &args) { return args.n; } + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hemm(args.layout, args.side, args.triangle, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhemm(static_cast(args.layout), + static_cast(args.side), + static_cast(args.triangle), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, + CommandQueue &queue) { + std::vector result(args.c_size, static_cast(0)); + 
buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.m; } + static size_t ResultID2(const Arguments &args) { return args.n; } + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { + return (args.layout == Layout::kRowMajor) ? + id1*args.c_ld + id2 + args.c_offset: + id2*args.c_ld + id1 + args.c_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.m * args.n * args.m; + } + static size_t GetBytes(const Arguments &args) { + return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XHEMM_H_ +#endif diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.h new file mode 100644 index 00000000..f13e8a62 --- /dev/null +++ b/test/routines/level3/xher2k.h @@ -0,0 +1,132 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xher2k routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XHER2K_H_ +#define CLBLAST_TEST_ROUTINES_XHER2K_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXher2k { + public: + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, kArgK, + kArgLayout, kArgTriangle, kArgATransp, + kArgALeadDim, kArgBLeadDim, kArgCLeadDim, + kArgAOffset, kArgBOffset, kArgCOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments &args) { + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + auto a_two = (a_rotated) ? args.n : args.k; + return a_two * args.a_ld + args.a_offset; + } + static size_t GetSizeB(const Arguments &args) { + auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo); + auto b_two = (b_rotated) ? 
args.n : args.k; + return b_two * args.b_ld + args.b_offset; + } + static size_t GetSizeC(const Arguments &args) { + return args.n * args.c_ld + args.c_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.b_size = GetSizeB(args); + args.c_size = GetSizeC(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.k; } + static size_t DefaultLDB(const Arguments &args) { return args.k; } + static size_t DefaultLDC(const Arguments &args) { return args.n; } + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto alpha2 = T{args.alpha, args.alpha}; + auto status = Her2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, alpha2, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto alpha2 = T{args.alpha, args.alpha}; + auto status = clblasXher2k(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, alpha2, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which 
buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, + CommandQueue &queue) { + std::vector result(args.c_size, static_cast(0)); + buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.n; } + static size_t ResultID2(const Arguments &args) { return args.n; } + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { + return id1*args.c_ld + id2 + args.c_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n * args.n * args.k; + } + static size_t GetBytes(const Arguments &args) { + return (args.n*args.k + args.n*args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XHER2K_H_ +#endif diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.h new file mode 100644 index 00000000..780b9b52 --- /dev/null +++ b/test/routines/level3/xherk.h @@ -0,0 +1,121 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xherk routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XHERK_H_ +#define CLBLAST_TEST_ROUTINES_XHERK_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXherk { + public: + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, kArgK, + kArgLayout, kArgTriangle, kArgATransp, + kArgALeadDim, kArgCLeadDim, + kArgAOffset, kArgCOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments &args) { + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + auto a_two = (a_rotated) ? 
args.n : args.k; + return a_two * args.a_ld + args.a_offset; + } + static size_t GetSizeC(const Arguments &args) { + return args.n * args.c_ld + args.c_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.c_size = GetSizeC(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.k; } + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &args) { return args.n; } + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Herk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXherk(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, + CommandQueue &queue) { + std::vector result(args.c_size, static_cast(0)); + 
buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.n; } + static size_t ResultID2(const Arguments &args) { return args.n; } + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { + return id1*args.c_ld + id2 + args.c_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return args.n * args.n * args.k; + } + static size_t GetBytes(const Arguments &args) { + return (args.n*args.k + args.n*args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XHERK_H_ +#endif diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.h new file mode 100644 index 00000000..10476349 --- /dev/null +++ b/test/routines/level3/xsymm.h @@ -0,0 +1,134 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xsymm routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XSYMM_H_ +#define CLBLAST_TEST_ROUTINES_XSYMM_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXsymm { + public: + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgM, kArgN, + kArgLayout, kArgSide, kArgTriangle, + kArgALeadDim, kArgBLeadDim, kArgCLeadDim, + kArgAOffset, kArgBOffset, kArgCOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments &args) { + size_t k_value = (args.side == Side::kLeft) ? args.m : args.n; + auto a_rotated = (args.layout == Layout::kRowMajor); + auto a_two = (a_rotated) ? args.m : k_value; + return a_two * args.a_ld + args.a_offset; + } + static size_t GetSizeB(const Arguments &args) { + size_t k_value = (args.side == Side::kLeft) ? args.m : args.n; + auto b_rotated = (args.layout == Layout::kRowMajor); + auto b_two = (b_rotated) ? k_value : args.n; + return b_two * args.b_ld + args.b_offset; + } + static size_t GetSizeC(const Arguments &args) { + auto c_rotated = (args.layout == Layout::kRowMajor); + auto c_two = (c_rotated) ? 
args.m : args.n; + return c_two * args.c_ld + args.c_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.b_size = GetSizeB(args); + args.c_size = GetSizeC(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.m; } + static size_t DefaultLDB(const Arguments &args) { return args.n; } + static size_t DefaultLDC(const Arguments &args) { return args.n; } + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Symm(args.layout, args.side, args.triangle, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsymm(static_cast(args.layout), + static_cast(args.side), + static_cast(args.triangle), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, + CommandQueue &queue) { + std::vector result(args.c_size, static_cast(0)); + 
buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.m; } + static size_t ResultID2(const Arguments &args) { return args.n; } + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { + return (args.layout == Layout::kRowMajor) ? + id1*args.c_ld + id2 + args.c_offset: + id2*args.c_ld + id1 + args.c_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.m * args.n * args.m; + } + static size_t GetBytes(const Arguments &args) { + return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XSYMM_H_ +#endif diff --git a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.h new file mode 100644 index 00000000..f3b1b542 --- /dev/null +++ b/test/routines/level3/xsyr2k.h @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xsyr2k routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XSYR2K_H_ +#define CLBLAST_TEST_ROUTINES_XSYR2K_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXsyr2k { + public: + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, kArgK, + kArgLayout, kArgTriangle, kArgATransp, + kArgALeadDim, kArgBLeadDim, kArgCLeadDim, + kArgAOffset, kArgBOffset, kArgCOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments &args) { + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + auto a_two = (a_rotated) ? args.n : args.k; + return a_two * args.a_ld + args.a_offset; + } + static size_t GetSizeB(const Arguments &args) { + auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo); + auto b_two = (b_rotated) ? 
args.n : args.k; + return b_two * args.b_ld + args.b_offset; + } + static size_t GetSizeC(const Arguments &args) { + return args.n * args.c_ld + args.c_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.b_size = GetSizeB(args); + args.c_size = GetSizeC(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.k; } + static size_t DefaultLDB(const Arguments &args) { return args.k; } + static size_t DefaultLDC(const Arguments &args) { return args.n; } + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syr2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyr2k(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers 
&buffers, + CommandQueue &queue) { + std::vector result(args.c_size, static_cast(0)); + buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.n; } + static size_t ResultID2(const Arguments &args) { return args.n; } + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { + return id1*args.c_ld + id2 + args.c_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n * args.n * args.k; + } + static size_t GetBytes(const Arguments &args) { + return (args.n*args.k + args.n*args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XSYR2K_H_ +#endif diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.h new file mode 100644 index 00000000..2ec9fb65 --- /dev/null +++ b/test/routines/level3/xsyrk.h @@ -0,0 +1,121 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xsyrk routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XSYRK_H_ +#define CLBLAST_TEST_ROUTINES_XSYRK_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXsyrk { + public: + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, kArgK, + kArgLayout, kArgTriangle, kArgATransp, + kArgALeadDim, kArgCLeadDim, + kArgAOffset, kArgCOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments &args) { + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + auto a_two = (a_rotated) ? 
args.n : args.k; + return a_two * args.a_ld + args.a_offset; + } + static size_t GetSizeC(const Arguments &args) { + return args.n * args.c_ld + args.c_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.c_size = GetSizeC(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.k; } + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &args) { return args.n; } + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syrk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyrk(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, + CommandQueue &queue) { + std::vector result(args.c_size, static_cast(0)); + 
buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.n; } + static size_t ResultID2(const Arguments &args) { return args.n; } + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { + return id1*args.c_ld + id2 + args.c_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return args.n * args.n * args.k; + } + static size_t GetBytes(const Arguments &args) { + return (args.n*args.k + args.n*args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XSYRK_H_ +#endif diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.h new file mode 100644 index 00000000..7b7e7af1 --- /dev/null +++ b/test/routines/level3/xtrmm.h @@ -0,0 +1,127 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xtrmm routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XTRMM_H_ +#define CLBLAST_TEST_ROUTINES_XTRMM_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXtrmm { + public: + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgM, kArgN, + kArgLayout, kArgSide, kArgTriangle, kArgATransp, kArgDiagonal, + kArgALeadDim, kArgBLeadDim, + kArgAOffset, kArgBOffset, + kArgAlpha}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments &args) { + auto k = (args.side == Side::kLeft) ? args.m : args.n; + return k * args.a_ld + args.a_offset; + } + static size_t GetSizeB(const Arguments &args) { + auto b_rotated = (args.layout == Layout::kRowMajor); + auto b_two = (b_rotated) ? 
args.m : args.n; + return b_two * args.b_ld + args.b_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.b_size = GetSizeB(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.m; } + static size_t DefaultLDB(const Arguments &args) { return args.n; } + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, + CommandQueue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtrmm(static_cast(args.layout), + static_cast(args.side), + static_cast(args.triangle), + static_cast(args.a_transpose), + static_cast(args.diagonal), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, + CommandQueue &queue) { + std::vector result(args.b_size, static_cast(0)); + buffers.b_mat.ReadBuffer(queue, args.b_size*sizeof(T), 
result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { return args.m; } + static size_t ResultID2(const Arguments &args) { return args.n; } + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { + return (args.layout == Layout::kRowMajor) ? + id1*args.b_ld + id2 + args.b_offset: + id2*args.b_ld + id1 + args.b_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + auto k = (args.side == Side::kLeft) ? args.m : args.n; + return args.m * args.n * k; + } + static size_t GetBytes(const Arguments &args) { + auto k = (args.side == Side::kLeft) ? args.m : args.n; + return (k*k + 2*args.m*args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XTRMM_H_ +#endif diff --git a/test/routines/xaxpy.h b/test/routines/xaxpy.h deleted file mode 100644 index 6ce5d7e2..00000000 --- a/test/routines/xaxpy.h +++ /dev/null @@ -1,113 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements a class with static methods to describe the Xaxpy routine. Examples of -// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These -// static methods are used by the correctness tester and the performance tester. 
-// -// ================================================================================================= - -#ifndef CLBLAST_TEST_ROUTINES_XAXPY_H_ -#define CLBLAST_TEST_ROUTINES_XAXPY_H_ - -#include -#include - -#include "wrapper_clblas.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TestXaxpy { - public: - - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgN, - kArgXInc, kArgYInc, - kArgXOffset, kArgYOffset, - kArgAlpha}; - } - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { - return args.n * args.x_inc + args.x_offset; - } - static size_t GetSizeY(const Arguments &args) { - return args.n * args.y_inc + args.y_offset; - } - - // Describes how to set the sizes of all the buffers - static void SetSizes(Arguments &args) { - args.x_size = GetSizeX(args); - args.y_size = GetSizeY(args); - } - - // Describes what the default values of the leading dimensions of the matrices are - static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine - static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine - static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine - - // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Axpy(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - clWaitForEvents(1, &event); - return status; - } - - // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - 
CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXaxpy(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } - - // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { - std::vector result(args.y_size, static_cast(0)); - buffers.y_vec.ReadBuffer(queue, args.y_size*sizeof(T), result); - return result; - } - - // Describes how to compute the indices of the result buffer - static size_t ResultID1(const Arguments &args) { return args.n; } - static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine - static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { - return id1*args.y_inc + args.y_offset; - } - - // Describes how to compute performance metrics - static size_t GetFlops(const Arguments &args) { - return 2 * args.n; - } - static size_t GetBytes(const Arguments &args) { - return (3 * args.n) * sizeof(T); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_TEST_ROUTINES_XAXPY_H_ -#endif diff --git a/test/routines/xgemm.h b/test/routines/xgemm.h deleted file mode 100644 index 86a304d1..00000000 --- a/test/routines/xgemm.h +++ /dev/null @@ -1,134 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// This file implements a class with static methods to describe the Xgemm routine. Examples of -// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These -// static methods are used by the correctness tester and the performance tester. -// -// ================================================================================================= - -#ifndef CLBLAST_TEST_ROUTINES_XGEMM_H_ -#define CLBLAST_TEST_ROUTINES_XGEMM_H_ - -#include -#include - -#include "wrapper_clblas.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TestXgemm { - public: - - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgM, kArgN, kArgK, - kArgLayout, kArgATransp, kArgBTransp, - kArgALeadDim, kArgBLeadDim, kArgCLeadDim, - kArgAOffset, kArgBOffset, kArgCOffset, - kArgAlpha, kArgBeta}; - } - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeA(const Arguments &args) { - auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || - (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); - auto a_two = (a_rotated) ? args.m : args.k; - return a_two * args.a_ld + args.a_offset; - } - static size_t GetSizeB(const Arguments &args) { - auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) || - (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo); - auto b_two = (b_rotated) ? args.k : args.n; - return b_two * args.b_ld + args.b_offset; - } - static size_t GetSizeC(const Arguments &args) { - auto c_rotated = (args.layout == Layout::kRowMajor); - auto c_two = (c_rotated) ? 
args.m : args.n; - return c_two * args.c_ld + args.c_offset; - } - - // Describes how to set the sizes of all the buffers - static void SetSizes(Arguments &args) { - args.a_size = GetSizeA(args); - args.b_size = GetSizeB(args); - args.c_size = GetSizeC(args); - } - - // Describes what the default values of the leading dimensions of the matrices are - static size_t DefaultLDA(const Arguments &args) { return args.k; } - static size_t DefaultLDB(const Arguments &args) { return args.n; } - static size_t DefaultLDC(const Arguments &args) { return args.n; } - - // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, - args.m, args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - clWaitForEvents(1, &event); - return status; - } - - // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgemm(static_cast(args.layout), - static_cast(args.a_transpose), - static_cast(args.b_transpose), - args.m, args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } - - // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { - std::vector result(args.c_size, 
static_cast(0)); - buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); - return result; - } - - // Describes how to compute the indices of the result buffer - static size_t ResultID1(const Arguments &args) { return args.m; } - static size_t ResultID2(const Arguments &args) { return args.n; } - static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { - return (args.layout == Layout::kRowMajor) ? - id1*args.c_ld + id2 + args.c_offset: - id2*args.c_ld + id1 + args.c_offset; - } - - // Describes how to compute performance metrics - static size_t GetFlops(const Arguments &args) { - return 2 * args.m * args.n * args.m; - } - static size_t GetBytes(const Arguments &args) { - return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_TEST_ROUTINES_XGEMM_H_ -#endif diff --git a/test/routines/xgemv.h b/test/routines/xgemv.h deleted file mode 100644 index 73f7d76e..00000000 --- a/test/routines/xgemv.h +++ /dev/null @@ -1,132 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements a class with static methods to describe the Xgemv routine. Examples of -// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These -// static methods are used by the correctness tester and the performance tester. 
-// -// ================================================================================================= - -#ifndef CLBLAST_TEST_ROUTINES_XGEMV_H_ -#define CLBLAST_TEST_ROUTINES_XGEMV_H_ - -#include -#include - -#include "wrapper_clblas.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TestXgemv { - public: - - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgM, kArgN, - kArgLayout, kArgATransp, - kArgALeadDim, kArgXInc, kArgYInc, - kArgAOffset, kArgXOffset, kArgYOffset, - kArgAlpha, kArgBeta}; - } - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeA(const Arguments &args) { - auto a_rotated = (args.layout == Layout::kRowMajor); - auto a_two = (a_rotated) ? args.m : args.n; - return a_two * args.a_ld + args.a_offset; - } - static size_t GetSizeX(const Arguments &args) { - auto a_transposed = (args.a_transpose != Transpose::kNo); - auto n_real = (a_transposed) ? args.m : args.n; - return n_real * args.x_inc + args.x_offset; - } - static size_t GetSizeY(const Arguments &args) { - auto a_transposed = (args.a_transpose != Transpose::kNo); - auto m_real = (a_transposed) ? 
args.n : args.m; - return m_real * args.y_inc + args.y_offset; - } - - // Describes how to set the sizes of all the buffers - static void SetSizes(Arguments &args) { - args.a_size = GetSizeA(args); - args.x_size = GetSizeX(args); - args.y_size = GetSizeY(args); - } - - // Describes what the default values of the leading dimensions of the matrices are - static size_t DefaultLDA(const Arguments &args) { return args.n; } - static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine - static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine - - // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Gemv(args.layout, args.a_transpose, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - clWaitForEvents(1, &event); - return status; - } - - // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgemv(static_cast(args.layout), - static_cast(args.a_transpose), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } - - // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { - std::vector result(args.y_size, static_cast(0)); - 
buffers.y_vec.ReadBuffer(queue, args.y_size*sizeof(T), result); - return result; - } - - // Describes how to compute the indices of the result buffer - static size_t ResultID1(const Arguments &args) { - auto a_transposed = (args.a_transpose != Transpose::kNo); - return (a_transposed) ? args.n : args.m; - } - static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine - static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { - return id1*args.y_inc + args.y_offset; - } - - // Describes how to compute performance metrics - static size_t GetFlops(const Arguments &args) { - return 2 * args.m * args.n; - } - static size_t GetBytes(const Arguments &args) { - return (args.m*args.n + 2*args.m + args.n) * sizeof(T); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_TEST_ROUTINES_XGEMV_H_ -#endif diff --git a/test/routines/xhemm.h b/test/routines/xhemm.h deleted file mode 100644 index 75878b06..00000000 --- a/test/routines/xhemm.h +++ /dev/null @@ -1,134 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements a class with static methods to describe the Xhemm routine. Examples of -// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These -// static methods are used by the correctness tester and the performance tester. 
-// -// ================================================================================================= - -#ifndef CLBLAST_TEST_ROUTINES_XHEMM_H_ -#define CLBLAST_TEST_ROUTINES_XHEMM_H_ - -#include -#include - -#include "wrapper_clblas.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TestXhemm { - public: - - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgM, kArgN, - kArgLayout, kArgSide, kArgTriangle, - kArgALeadDim, kArgBLeadDim, kArgCLeadDim, - kArgAOffset, kArgBOffset, kArgCOffset, - kArgAlpha, kArgBeta}; - } - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeA(const Arguments &args) { - size_t k_value = (args.side == Side::kLeft) ? args.m : args.n; - auto a_rotated = (args.layout == Layout::kRowMajor); - auto a_two = (a_rotated) ? args.m : k_value; - return a_two * args.a_ld + args.a_offset; - } - static size_t GetSizeB(const Arguments &args) { - size_t k_value = (args.side == Side::kLeft) ? args.m : args.n; - auto b_rotated = (args.layout == Layout::kRowMajor); - auto b_two = (b_rotated) ? k_value : args.n; - return b_two * args.b_ld + args.b_offset; - } - static size_t GetSizeC(const Arguments &args) { - auto c_rotated = (args.layout == Layout::kRowMajor); - auto c_two = (c_rotated) ? 
args.m : args.n; - return c_two * args.c_ld + args.c_offset; - } - - // Describes how to set the sizes of all the buffers - static void SetSizes(Arguments &args) { - args.a_size = GetSizeA(args); - args.b_size = GetSizeB(args); - args.c_size = GetSizeC(args); - } - - // Describes what the default values of the leading dimensions of the matrices are - static size_t DefaultLDA(const Arguments &args) { return args.m; } - static size_t DefaultLDB(const Arguments &args) { return args.n; } - static size_t DefaultLDC(const Arguments &args) { return args.n; } - - // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Hemm(args.layout, args.side, args.triangle, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - clWaitForEvents(1, &event); - return status; - } - - // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhemm(static_cast(args.layout), - static_cast(args.side), - static_cast(args.triangle), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } - - // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { - std::vector result(args.c_size, static_cast(0)); - 
buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); - return result; - } - - // Describes how to compute the indices of the result buffer - static size_t ResultID1(const Arguments &args) { return args.m; } - static size_t ResultID2(const Arguments &args) { return args.n; } - static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { - return (args.layout == Layout::kRowMajor) ? - id1*args.c_ld + id2 + args.c_offset: - id2*args.c_ld + id1 + args.c_offset; - } - - // Describes how to compute performance metrics - static size_t GetFlops(const Arguments &args) { - return 2 * args.m * args.n * args.m; - } - static size_t GetBytes(const Arguments &args) { - return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_TEST_ROUTINES_XHEMM_H_ -#endif diff --git a/test/routines/xher2k.h b/test/routines/xher2k.h deleted file mode 100644 index f13e8a62..00000000 --- a/test/routines/xher2k.h +++ /dev/null @@ -1,132 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements a class with static methods to describe the Xher2k routine. Examples of -// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These -// static methods are used by the correctness tester and the performance tester. 
-// -// ================================================================================================= - -#ifndef CLBLAST_TEST_ROUTINES_XHER2K_H_ -#define CLBLAST_TEST_ROUTINES_XHER2K_H_ - -#include -#include - -#include "wrapper_clblas.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TestXher2k { - public: - - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgN, kArgK, - kArgLayout, kArgTriangle, kArgATransp, - kArgALeadDim, kArgBLeadDim, kArgCLeadDim, - kArgAOffset, kArgBOffset, kArgCOffset, - kArgAlpha, kArgBeta}; - } - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeA(const Arguments &args) { - auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || - (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); - auto a_two = (a_rotated) ? args.n : args.k; - return a_two * args.a_ld + args.a_offset; - } - static size_t GetSizeB(const Arguments &args) { - auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) || - (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo); - auto b_two = (b_rotated) ? 
args.n : args.k; - return b_two * args.b_ld + args.b_offset; - } - static size_t GetSizeC(const Arguments &args) { - return args.n * args.c_ld + args.c_offset; - } - - // Describes how to set the sizes of all the buffers - static void SetSizes(Arguments &args) { - args.a_size = GetSizeA(args); - args.b_size = GetSizeB(args); - args.c_size = GetSizeC(args); - } - - // Describes what the default values of the leading dimensions of the matrices are - static size_t DefaultLDA(const Arguments &args) { return args.k; } - static size_t DefaultLDB(const Arguments &args) { return args.k; } - static size_t DefaultLDC(const Arguments &args) { return args.n; } - - // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto alpha2 = T{args.alpha, args.alpha}; - auto status = Her2k(args.layout, args.triangle, args.a_transpose, - args.n, args.k, alpha2, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - clWaitForEvents(1, &event); - return status; - } - - // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto alpha2 = T{args.alpha, args.alpha}; - auto status = clblasXher2k(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - args.n, args.k, alpha2, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } - - // Describes how to download the results of the computation (more importantly: which 
buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { - std::vector result(args.c_size, static_cast(0)); - buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); - return result; - } - - // Describes how to compute the indices of the result buffer - static size_t ResultID1(const Arguments &args) { return args.n; } - static size_t ResultID2(const Arguments &args) { return args.n; } - static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { - return id1*args.c_ld + id2 + args.c_offset; - } - - // Describes how to compute performance metrics - static size_t GetFlops(const Arguments &args) { - return 2 * args.n * args.n * args.k; - } - static size_t GetBytes(const Arguments &args) { - return (args.n*args.k + args.n*args.n) * sizeof(T); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_TEST_ROUTINES_XHER2K_H_ -#endif diff --git a/test/routines/xherk.h b/test/routines/xherk.h deleted file mode 100644 index 780b9b52..00000000 --- a/test/routines/xherk.h +++ /dev/null @@ -1,121 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements a class with static methods to describe the Xherk routine. Examples of -// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These -// static methods are used by the correctness tester and the performance tester. 
-// -// ================================================================================================= - -#ifndef CLBLAST_TEST_ROUTINES_XHERK_H_ -#define CLBLAST_TEST_ROUTINES_XHERK_H_ - -#include -#include - -#include "wrapper_clblas.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TestXherk { - public: - - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgN, kArgK, - kArgLayout, kArgTriangle, kArgATransp, - kArgALeadDim, kArgCLeadDim, - kArgAOffset, kArgCOffset, - kArgAlpha, kArgBeta}; - } - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeA(const Arguments &args) { - auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || - (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); - auto a_two = (a_rotated) ? 
args.n : args.k; - return a_two * args.a_ld + args.a_offset; - } - static size_t GetSizeC(const Arguments &args) { - return args.n * args.c_ld + args.c_offset; - } - - // Describes how to set the sizes of all the buffers - static void SetSizes(Arguments &args) { - args.a_size = GetSizeA(args); - args.c_size = GetSizeC(args); - } - - // Describes what the default values of the leading dimensions of the matrices are - static size_t DefaultLDA(const Arguments &args) { return args.k; } - static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine - static size_t DefaultLDC(const Arguments &args) { return args.n; } - - // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Herk(args.layout, args.triangle, args.a_transpose, - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - clWaitForEvents(1, &event); - return status; - } - - // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXherk(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } - - // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { - std::vector result(args.c_size, static_cast(0)); - 
buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); - return result; - } - - // Describes how to compute the indices of the result buffer - static size_t ResultID1(const Arguments &args) { return args.n; } - static size_t ResultID2(const Arguments &args) { return args.n; } - static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { - return id1*args.c_ld + id2 + args.c_offset; - } - - // Describes how to compute performance metrics - static size_t GetFlops(const Arguments &args) { - return args.n * args.n * args.k; - } - static size_t GetBytes(const Arguments &args) { - return (args.n*args.k + args.n*args.n) * sizeof(T); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_TEST_ROUTINES_XHERK_H_ -#endif diff --git a/test/routines/xsymm.h b/test/routines/xsymm.h deleted file mode 100644 index 10476349..00000000 --- a/test/routines/xsymm.h +++ /dev/null @@ -1,134 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements a class with static methods to describe the Xsymm routine. Examples of -// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These -// static methods are used by the correctness tester and the performance tester. 
-// -// ================================================================================================= - -#ifndef CLBLAST_TEST_ROUTINES_XSYMM_H_ -#define CLBLAST_TEST_ROUTINES_XSYMM_H_ - -#include -#include - -#include "wrapper_clblas.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TestXsymm { - public: - - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgM, kArgN, - kArgLayout, kArgSide, kArgTriangle, - kArgALeadDim, kArgBLeadDim, kArgCLeadDim, - kArgAOffset, kArgBOffset, kArgCOffset, - kArgAlpha, kArgBeta}; - } - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeA(const Arguments &args) { - size_t k_value = (args.side == Side::kLeft) ? args.m : args.n; - auto a_rotated = (args.layout == Layout::kRowMajor); - auto a_two = (a_rotated) ? args.m : k_value; - return a_two * args.a_ld + args.a_offset; - } - static size_t GetSizeB(const Arguments &args) { - size_t k_value = (args.side == Side::kLeft) ? args.m : args.n; - auto b_rotated = (args.layout == Layout::kRowMajor); - auto b_two = (b_rotated) ? k_value : args.n; - return b_two * args.b_ld + args.b_offset; - } - static size_t GetSizeC(const Arguments &args) { - auto c_rotated = (args.layout == Layout::kRowMajor); - auto c_two = (c_rotated) ? 
args.m : args.n; - return c_two * args.c_ld + args.c_offset; - } - - // Describes how to set the sizes of all the buffers - static void SetSizes(Arguments &args) { - args.a_size = GetSizeA(args); - args.b_size = GetSizeB(args); - args.c_size = GetSizeC(args); - } - - // Describes what the default values of the leading dimensions of the matrices are - static size_t DefaultLDA(const Arguments &args) { return args.m; } - static size_t DefaultLDB(const Arguments &args) { return args.n; } - static size_t DefaultLDC(const Arguments &args) { return args.n; } - - // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Symm(args.layout, args.side, args.triangle, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - clWaitForEvents(1, &event); - return status; - } - - // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsymm(static_cast(args.layout), - static_cast(args.side), - static_cast(args.triangle), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } - - // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { - std::vector result(args.c_size, static_cast(0)); - 
buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); - return result; - } - - // Describes how to compute the indices of the result buffer - static size_t ResultID1(const Arguments &args) { return args.m; } - static size_t ResultID2(const Arguments &args) { return args.n; } - static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { - return (args.layout == Layout::kRowMajor) ? - id1*args.c_ld + id2 + args.c_offset: - id2*args.c_ld + id1 + args.c_offset; - } - - // Describes how to compute performance metrics - static size_t GetFlops(const Arguments &args) { - return 2 * args.m * args.n * args.m; - } - static size_t GetBytes(const Arguments &args) { - return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_TEST_ROUTINES_XSYMM_H_ -#endif diff --git a/test/routines/xsyr2k.h b/test/routines/xsyr2k.h deleted file mode 100644 index f3b1b542..00000000 --- a/test/routines/xsyr2k.h +++ /dev/null @@ -1,130 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements a class with static methods to describe the Xsyr2k routine. Examples of -// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These -// static methods are used by the correctness tester and the performance tester. 
-// -// ================================================================================================= - -#ifndef CLBLAST_TEST_ROUTINES_XSYR2K_H_ -#define CLBLAST_TEST_ROUTINES_XSYR2K_H_ - -#include -#include - -#include "wrapper_clblas.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TestXsyr2k { - public: - - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgN, kArgK, - kArgLayout, kArgTriangle, kArgATransp, - kArgALeadDim, kArgBLeadDim, kArgCLeadDim, - kArgAOffset, kArgBOffset, kArgCOffset, - kArgAlpha, kArgBeta}; - } - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeA(const Arguments &args) { - auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || - (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); - auto a_two = (a_rotated) ? args.n : args.k; - return a_two * args.a_ld + args.a_offset; - } - static size_t GetSizeB(const Arguments &args) { - auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) || - (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo); - auto b_two = (b_rotated) ? 
args.n : args.k; - return b_two * args.b_ld + args.b_offset; - } - static size_t GetSizeC(const Arguments &args) { - return args.n * args.c_ld + args.c_offset; - } - - // Describes how to set the sizes of all the buffers - static void SetSizes(Arguments &args) { - args.a_size = GetSizeA(args); - args.b_size = GetSizeB(args); - args.c_size = GetSizeC(args); - } - - // Describes what the default values of the leading dimensions of the matrices are - static size_t DefaultLDA(const Arguments &args) { return args.k; } - static size_t DefaultLDB(const Arguments &args) { return args.k; } - static size_t DefaultLDC(const Arguments &args) { return args.n; } - - // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Syr2k(args.layout, args.triangle, args.a_transpose, - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - clWaitForEvents(1, &event); - return status; - } - - // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyr2k(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } - - // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers 
&buffers, - CommandQueue &queue) { - std::vector result(args.c_size, static_cast(0)); - buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); - return result; - } - - // Describes how to compute the indices of the result buffer - static size_t ResultID1(const Arguments &args) { return args.n; } - static size_t ResultID2(const Arguments &args) { return args.n; } - static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { - return id1*args.c_ld + id2 + args.c_offset; - } - - // Describes how to compute performance metrics - static size_t GetFlops(const Arguments &args) { - return 2 * args.n * args.n * args.k; - } - static size_t GetBytes(const Arguments &args) { - return (args.n*args.k + args.n*args.n) * sizeof(T); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_TEST_ROUTINES_XSYR2K_H_ -#endif diff --git a/test/routines/xsyrk.h b/test/routines/xsyrk.h deleted file mode 100644 index 2ec9fb65..00000000 --- a/test/routines/xsyrk.h +++ /dev/null @@ -1,121 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements a class with static methods to describe the Xsyrk routine. Examples of -// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These -// static methods are used by the correctness tester and the performance tester. 
-// -// ================================================================================================= - -#ifndef CLBLAST_TEST_ROUTINES_XSYRK_H_ -#define CLBLAST_TEST_ROUTINES_XSYRK_H_ - -#include -#include - -#include "wrapper_clblas.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TestXsyrk { - public: - - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgN, kArgK, - kArgLayout, kArgTriangle, kArgATransp, - kArgALeadDim, kArgCLeadDim, - kArgAOffset, kArgCOffset, - kArgAlpha, kArgBeta}; - } - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeA(const Arguments &args) { - auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || - (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); - auto a_two = (a_rotated) ? 
args.n : args.k; - return a_two * args.a_ld + args.a_offset; - } - static size_t GetSizeC(const Arguments &args) { - return args.n * args.c_ld + args.c_offset; - } - - // Describes how to set the sizes of all the buffers - static void SetSizes(Arguments &args) { - args.a_size = GetSizeA(args); - args.c_size = GetSizeC(args); - } - - // Describes what the default values of the leading dimensions of the matrices are - static size_t DefaultLDA(const Arguments &args) { return args.k; } - static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine - static size_t DefaultLDC(const Arguments &args) { return args.n; } - - // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Syrk(args.layout, args.triangle, args.a_transpose, - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - clWaitForEvents(1, &event); - return status; - } - - // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyrk(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } - - // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { - std::vector result(args.c_size, static_cast(0)); - 
buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); - return result; - } - - // Describes how to compute the indices of the result buffer - static size_t ResultID1(const Arguments &args) { return args.n; } - static size_t ResultID2(const Arguments &args) { return args.n; } - static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { - return id1*args.c_ld + id2 + args.c_offset; - } - - // Describes how to compute performance metrics - static size_t GetFlops(const Arguments &args) { - return args.n * args.n * args.k; - } - static size_t GetBytes(const Arguments &args) { - return (args.n*args.k + args.n*args.n) * sizeof(T); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_TEST_ROUTINES_XSYRK_H_ -#endif diff --git a/test/routines/xtrmm.h b/test/routines/xtrmm.h deleted file mode 100644 index 7b7e7af1..00000000 --- a/test/routines/xtrmm.h +++ /dev/null @@ -1,127 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements a class with static methods to describe the Xtrmm routine. Examples of -// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These -// static methods are used by the correctness tester and the performance tester. 
-// -// ================================================================================================= - -#ifndef CLBLAST_TEST_ROUTINES_XTRMM_H_ -#define CLBLAST_TEST_ROUTINES_XTRMM_H_ - -#include -#include - -#include "wrapper_clblas.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TestXtrmm { - public: - - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgM, kArgN, - kArgLayout, kArgSide, kArgTriangle, kArgATransp, kArgDiagonal, - kArgALeadDim, kArgBLeadDim, - kArgAOffset, kArgBOffset, - kArgAlpha}; - } - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeA(const Arguments &args) { - auto k = (args.side == Side::kLeft) ? args.m : args.n; - return k * args.a_ld + args.a_offset; - } - static size_t GetSizeB(const Arguments &args) { - auto b_rotated = (args.layout == Layout::kRowMajor); - auto b_two = (b_rotated) ? 
args.m : args.n; - return b_two * args.b_ld + args.b_offset; - } - - // Describes how to set the sizes of all the buffers - static void SetSizes(Arguments &args) { - args.a_size = GetSizeA(args); - args.b_size = GetSizeB(args); - } - - // Describes what the default values of the leading dimensions of the matrices are - static size_t DefaultLDA(const Arguments &args) { return args.m; } - static size_t DefaultLDB(const Arguments &args) { return args.n; } - static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine - - // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, - &queue_plain, &event); - clWaitForEvents(1, &event); - return status; - } - - // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXtrmm(static_cast(args.layout), - static_cast(args.side), - static_cast(args.triangle), - static_cast(args.a_transpose), - static_cast(args.diagonal), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } - - // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { - std::vector result(args.b_size, static_cast(0)); - buffers.b_mat.ReadBuffer(queue, args.b_size*sizeof(T), 
result); - return result; - } - - // Describes how to compute the indices of the result buffer - static size_t ResultID1(const Arguments &args) { return args.m; } - static size_t ResultID2(const Arguments &args) { return args.n; } - static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { - return (args.layout == Layout::kRowMajor) ? - id1*args.b_ld + id2 + args.b_offset: - id2*args.b_ld + id1 + args.b_offset; - } - - // Describes how to compute performance metrics - static size_t GetFlops(const Arguments &args) { - auto k = (args.side == Side::kLeft) ? args.m : args.n; - return args.m * args.n * k; - } - static size_t GetBytes(const Arguments &args) { - auto k = (args.side == Side::kLeft) ? args.m : args.n; - return (k*k + 2*args.m*args.n) * sizeof(T); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_TEST_ROUTINES_XTRMM_H_ -#endif -- cgit v1.2.3