diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-06-16 18:07:46 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2016-06-16 18:07:46 +0200 |
commit | 52ccaf5b25e14c9ce032315e5e96b1f27886d481 (patch) | |
tree | 087288b7aebf2a06ffc4e7dcbcd4353f7a3be6a7 | |
parent | 39b7dbc5e37829abfbcfb77852b9138b31540b42 (diff) |
Added XOMATCOPY routines to perform out-of-place matrix scaling, copying, and/or transposing
33 files changed, 889 insertions, 64 deletions
@@ -11,6 +11,8 @@ Development version (next release) * Level-1: HSWAP/HSCAL/HCOPY/HAXPY/HDOT/HNRM2/HASUM/HSUM/iHAMAX/iHMAX/iHMIN * Level-2: HGEMV/HGBMV/HHEMV/HHBMV/HHPMV/HSYMV/HSBMV/HSPMV/HTRMV/HTBMV/HTPMV/HGER/HSYR/HSPR/HSYR2/HSPR2 * Level-3: HGEMM/HSYMM/HSYRK/HSYR2K/HTRMM +- Added non-BLAS routines: + * SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY (matrix copy, scaling, and/or transpose) Version 0.7.1 - Improved performance of large power-of-2 xGEMM kernels for AMD GPUs diff --git a/CMakeLists.txt b/CMakeLists.txt index 66547cc0..21982f39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -133,7 +133,8 @@ set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm) -set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES}) +set(LEVELX_ROUTINES xomatcopy) +set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES}) set(PRECISIONS 32 64 3232 6464) # ================================================================================================== @@ -150,6 +151,9 @@ endforeach() foreach(ROUTINE ${LEVEL3_ROUTINES}) set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc) endforeach() +foreach(ROUTINE ${LEVELX_ROUTINES}) + set(SOURCES ${SOURCES} src/routines/levelx/${ROUTINE}.cc) +endforeach() # Creates and links the library add_library(clblast SHARED ${SOURCES}) @@ -279,6 +283,10 @@ if(CLIENTS) add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common> test/performance/routines/level3/${ROUTINE}.cc) endforeach() + foreach(ROUTINE ${LEVELX_ROUTINES}) + add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common> + test/performance/routines/levelx/${ROUTINE}.cc) + endforeach() foreach(ROUTINE ${ROUTINES}) target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) install(TARGETS clblast_client_${ROUTINE} DESTINATION bin) @@ -310,6 +318,10 @@ if(TESTS) add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common> test/correctness/routines/level3/${ROUTINE}.cc) endforeach() + foreach(ROUTINE ${LEVELX_ROUTINES}) + add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common> + test/correctness/routines/levelx/${ROUTINE}.cc) + endforeach() foreach(ROUTINE ${ROUTINES}) target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) install(TARGETS clblast_test_${ROUTINE} DESTINATION bin) @@ -235,13 +235,14 @@ CLBlast is in active development but already supports almost all the BLAS routin | xHER2K | - | - | ✔ | ✔ | - | | xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ | -In addition, some non-BLAS routines are also supported by CLBlast. They are experimental and should be used with care: +In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care: -| Additional | S | D | C | Z | H | +| Level-X | S | D | C | Z | H | | -----------|---|---|---|---|---| | xSUM | ✔ | ✔ | ✔ | ✔ | ✔ | | IxMAX | ✔ | ✔ | ✔ | ✔ | ✔ | | IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ | +| xOMATCOPY | ✔ | ✔ | ✔ | ✔ | ✔ | Some BLAS routines are not supported yet by CLBlast. They are shown in the following table: diff --git a/doc/clblast.md b/doc/clblast.md index 5f337dd3..5105d023 100644 --- a/doc/clblast.md +++ b/doc/clblast.md @@ -2708,3 +2708,76 @@ Requirements for TRMM: +xOMATCOPY: Scaling and out-place transpose/copy (non-BLAS function) +------------- + +Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition. + +C++ API: +``` +template <typename T> +StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastComatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to OMATCOPY: + +* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const size_t m`: Integer size argument. This value must be positive. +* `const size_t n`: Integer size argument. This value must be positive. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0. +* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix. +* `const size_t b_offset`: The offset in elements from the start of the output B matrix. +* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + +Requirements for OMATCOPY: + +* The value of `a_ld` must be at least `m`. +* The value of `b_ld` must be at least `n`. + + + diff --git a/include/clblast.h b/include/clblast.h index 64b2610a..31a07423 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -560,6 +560,19 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c cl_command_queue* queue, cl_event* event = nullptr); // ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +template <typename T> +StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// ================================================================================================= // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on // for the same device. This cache can be cleared to free up system memory or in case of debugging. diff --git a/include/clblast_c.h b/include/clblast_c.h index 40248615..3ac6d99c 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -1266,6 +1266,42 @@ StatusCode PUBLIC_API CLBlastHtrsm(const Layout layout, const Side side, const T cl_command_queue* queue, cl_event* event); // ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +StatusCode PUBLIC_API CLBlastSomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastComatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); + +// ================================================================================================= // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on // for the same device. This cache can be cleared to free up system memory or in case of debugging. diff --git a/include/internal/routine.h b/include/internal/routine.h index d420e2db..35837575 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -83,6 +83,7 @@ class Routine { const size_t dest_one, const size_t dest_two, const size_t dest_ld, const size_t dest_offset, const Buffer<T> &dest, + const T alpha, const Program &program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const bool upper = false, const bool lower = false, diff --git a/include/internal/routines/levelx/xomatcopy.h b/include/internal/routines/levelx/xomatcopy.h new file mode 100644 index 00000000..38df846e --- /dev/null +++ b/include/internal/routines/levelx/xomatcopy.h @@ -0,0 +1,54 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xomatcopy routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XOMATCOPY_H_ +#define CLBLAST_ROUTINES_XOMATCOPY_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xomatcopy: public Routine<T> { + public: + + // Members and methods from the base class + using Routine<T>::source_string_; + using Routine<T>::event_; + using Routine<T>::GetProgramFromCache; + using Routine<T>::PadCopyTransposeMatrix; + using Routine<T>::TestMatrixA; + using Routine<T>::TestMatrixB; + using Routine<T>::ErrorIn; + + // Constructor + Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY"); + + // Templated-precision implementation of the routine + StatusCode DoOmatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XOMATCOPY_H_ +#endif diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 7bb66749..6726adda 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -68,6 +68,7 @@ ald_transa_m_k = "When `transpose_a == Transpose::kNo`, then `a_ld` must be at l ald_trans_n_k = "When `transpose == Transpose::kNo`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`." ald_side_m_n = "When `side = Side::kLeft` then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `n`." bld_m = "The value of `b_ld` must be at least `m`." +bld_n = "The value of `b_ld` must be at least `n`." bld_transb_k_n = "When `transpose_b == Transpose::kNo`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`." bld_trans_n_k = "When `transpose == Transpose::kNo`, then `b_ld` must be at least `n`, otherwise `b_ld` must be at least `k`." cld_m = "The value of `c_ld` must be at least `m`." @@ -134,6 +135,9 @@ routines = [ Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []), +], +[ # Level X: extra routines (not part of BLAS) + Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), ]] # ================================================================================================== @@ -148,6 +152,7 @@ def PrecisionToFullName(x): }[x] # ================================================================================================== + # Separators for the BLAS levels separators = [""" // ================================================================================================= @@ -160,8 +165,15 @@ separators = [""" """ // ================================================================================================= // BLAS level-3 (matrix-matrix) routines +// =================================================================================================""", +""" +// ================================================================================================= +// Extra non-BLAS routines (level-X) // ================================================================================================="""] +# Names of the level sub-folders +levelnames = ["1", "2", "3", "x"] + # Main header/footer for source files header = """ // ================================================================================================= @@ -373,7 +385,7 @@ files = [ path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 71, 93, 22, 29, 41] +header_lines = [84, 74, 93, 22, 29, 41] footer_lines = [17, 71, 19, 14, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise @@ -396,7 +408,8 @@ for i in xrange(0,len(files)): # Re-writes the body of the file with open(files[i], "w") as f: body = "" - for level in [1,2,3]: + levels = [1,2,3] if (i == 4 or i == 5) else [1,2,3,4] + for level in levels: body += separators[level-1]+"\n" if i == 0: body += clblast_h(routines[level-1]) @@ -417,14 +430,14 @@ for i in xrange(0,len(files)): # ================================================================================================== # Outputs all the correctness-test implementations -for level in [1,2,3]: +for level in [1,2,3,4]: for routine in routines[level-1]: if routine.has_tests: - filename = path_clblast+"/test/correctness/routines/level"+str(level)+"/x"+routine.name+".cc" + filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cc" with open(filename, "w") as f: body = "" body += "#include \"correctness/testblas.h\"\n" - body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n" + body += "#include \"routines/level"+levelnames[level-1]+"/x"+routine.name+".h\"\n\n" body += "// Shortcuts to the clblast namespace\n" body += "using float2 = clblast::float2;\n" body += "using double2 = clblast::double2;\n\n" @@ -443,14 +456,14 @@ for level in [1,2,3]: f.write(footer) # Outputs all the performance-test implementations -for level in [1,2,3]: +for level in [1,2,3,4]: for routine in routines[level-1]: if routine.has_tests: - filename = path_clblast+"/test/performance/routines/level"+str(level)+"/x"+routine.name+".cc" + filename = path_clblast+"/test/performance/routines/level"+levelnames[level-1]+"/x"+routine.name+".cc" with open(filename, "w") as f: body = "" body += "#include \"performance/client.h\"\n" - body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n" + body += "#include \"routines/level"+levelnames[level-1]+"/x"+routine.name+".h\"\n\n" body += "// Shortcuts to the clblast namespace\n" body += "using float2 = clblast::float2;\n" body += "using double2 = clblast::double2;\n\n" @@ -487,7 +500,7 @@ with open(filename, "w") as f: f.write("\n\n") # Loops over the routines - for level in [1,2,3]: + for level in [1,2,3,4]: for routine in routines[level-1]: if routine.implemented: diff --git a/src/clblast.cc b/src/clblast.cc index 07322327..e3df6ede 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -68,6 +68,9 @@ #include "internal/routines/level3/xher2k.h" #include "internal/routines/level3/xtrmm.h" +// Extra includes (level-x) +#include "internal/routines/levelx/xomatcopy.h" + namespace clblast { // ================================================================================================= @@ -2062,6 +2065,59 @@ template StatusCode PUBLIC_API Trsm<half>(const Layout, const Side, const Triang cl_command_queue*, cl_event*); // ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +template <typename T> +StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xomatcopy<T>(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoOmatcopy(layout, a_transpose, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld); +} +template StatusCode PUBLIC_API Omatcopy<float>(const Layout, const Transpose, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Omatcopy<double>(const Layout, const Transpose, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Omatcopy<float2>(const Layout, const Transpose, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Omatcopy<double2>(const Layout, const Transpose, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// ================================================================================================= // Clears the cache of stored binaries StatusCode ClearCache() { return cache::ClearCache(); } diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 2aac907a..22cb2192 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -2832,6 +2832,87 @@ StatusCode CLBlastHtrsm(const Layout layout, const Side side, const Triangle tri } // ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// OMATCOPY +StatusCode CLBlastSomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastDomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastComatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastZomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast<StatusCode>(status); +} +StatusCode CLBlastHomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout), + static_cast<clblast::Transpose>(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast<StatusCode>(status); +} + +// ================================================================================================= // Clears the cache of stored binaries StatusCode CLBlastClearCache() { diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl index bfbfacd4..09e54e6d 100644 --- a/src/kernels/level3/copy_fast.opencl +++ b/src/kernels/level3/copy_fast.opencl @@ -38,13 +38,53 @@ R"( __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) __kernel void CopyMatrixFast(const int ld, __global const realC* restrict src, - __global realC* dest) { + __global realC* dest, + const __constant real* restrict arg_alpha) { + const real alpha = arg_alpha[0]; #pragma unroll for (int w_one=0; w_one<COPY_WPT; ++w_one) { const int id_one = get_global_id(0); const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1); const int id = id_two*(ld/COPY_VW) + id_one; - dest[id] = src[id]; + realC result; + #if COPY_VW == 1 + Multiply(result, alpha, src[id]); + #elif COPY_VW == 2 + Multiply(result.x, alpha, src[id].x); + Multiply(result.y, alpha, src[id].y); + #elif COPY_VW == 4 + Multiply(result.x, alpha, src[id].x); + Multiply(result.y, alpha, src[id].y); + Multiply(result.z, alpha, src[id].z); + Multiply(result.w, alpha, src[id].w); + #elif COPY_VW == 8 + Multiply(result.s0, alpha, src[id].s0); + Multiply(result.s1, alpha, src[id].s1); + Multiply(result.s2, alpha, src[id].s2); + Multiply(result.s3, alpha, src[id].s3); + Multiply(result.s4, alpha, src[id].s4); + Multiply(result.s5, alpha, src[id].s5); + Multiply(result.s6, alpha, src[id].s6); + Multiply(result.s7, alpha, src[id].s7); + #elif COPY_VW == 16 + Multiply(result.s0, alpha, src[id].s0); + Multiply(result.s1, alpha, src[id].s1); + Multiply(result.s2, alpha, src[id].s2); + Multiply(result.s3, alpha, src[id].s3); + Multiply(result.s4, alpha, src[id].s4); + Multiply(result.s5, alpha, src[id].s5); + Multiply(result.s6, alpha, src[id].s6); + Multiply(result.s7, alpha, src[id].s7); + Multiply(result.s8, alpha, src[id].s8); + Multiply(result.s9, alpha, src[id].s9); + Multiply(result.sA, alpha, src[id].sA); + Multiply(result.sB, alpha, src[id].sB); + Multiply(result.sC, alpha, src[id].sC); + Multiply(result.sD, alpha, src[id].sD); + Multiply(result.sE, alpha, src[id].sE); + Multiply(result.sF, alpha, src[id].sF); + #endif + dest[id] = result;; } } diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl index f211af0f..d276cc60 100644 --- a/src/kernels/level3/copy_pad.opencl +++ b/src/kernels/level3/copy_pad.opencl @@ -31,7 +31,9 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, __global real* dest, + const __constant real* restrict arg_alpha, const int do_conjugate) { + const real alpha = arg_alpha[0]; // Loops over the work per thread in both dimensions #pragma unroll @@ -52,7 +54,7 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two, // Stores the value in the destination matrix if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } - dest[id_two*dest_ld + id_one + dest_offset] = value; + Multiply(dest[id_two*dest_ld + id_one + dest_offset], alpha, value); } } } @@ -70,8 +72,10 @@ __kernel void CopyMatrix(const int src_one, const int src_two, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, __global real* dest, + const __constant real* restrict arg_alpha, const int upper, const int lower, const int diagonal_imag_zero) { + const real alpha = arg_alpha[0]; // Loops over the work per thread in both dimensions #pragma unroll @@ -94,7 +98,7 @@ __kernel void CopyMatrix(const int src_one, const int src_two, if (id_two < dest_two && id_one < dest_one) { real value = src[id_two*src_ld + id_one + src_offset]; if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); } - dest[id_two*dest_ld + id_one + dest_offset] = value; + Multiply(dest[id_two*dest_ld + id_one + dest_offset], alpha, value); } } } diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl index 08266461..d5c46a30 100644 --- a/src/kernels/level3/transpose_fast.opencl +++ b/src/kernels/level3/transpose_fast.opencl @@ -39,7 +39,9 @@ R"( __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) __kernel void TransposeMatrixFast(const int ld, __global const realT* restrict src, - __global realT* dest) { + __global realT* dest, + const __constant real* restrict arg_alpha) { + const real alpha = arg_alpha[0]; // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different // way over workgroups, breaking memory-bank dependencies. @@ -117,12 +119,50 @@ __kernel void TransposeMatrixFast(const int ld, results[15] = (realT) {v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF}; #endif - // Stores the results into the destination matrix + // Multiplies by alpha and then stores the results into the destination matrix #pragma unroll for (int w_two=0; w_two<TRA_WPT; ++w_two) { + realT result; + #if TRA_WPT == 1 + Multiply(result, alpha, results[w_two]); + #elif TRA_WPT == 2 + Multiply(result.x, alpha, results[w_two].x); + Multiply(result.y, alpha, results[w_two].y); + #elif TRA_WPT == 4 + Multiply(result.x, alpha, results[w_two].x); + Multiply(result.y, alpha, results[w_two].y); + Multiply(result.z, alpha, results[w_two].z); + Multiply(result.w, alpha, results[w_two].w); + #elif TRA_WPT == 8 + Multiply(result.s0, alpha, results[w_two].s0); + Multiply(result.s1, alpha, results[w_two].s1); + Multiply(result.s2, alpha, results[w_two].s2); + Multiply(result.s3, alpha, results[w_two].s3); + Multiply(result.s4, alpha, results[w_two].s4); + Multiply(result.s5, alpha, results[w_two].s5); + Multiply(result.s6, alpha, results[w_two].s6); + Multiply(result.s7, alpha, results[w_two].s7); + #elif TRA_WPT == 16 + Multiply(result.s0, alpha, results[w_two].s0); + Multiply(result.s1, alpha, results[w_two].s1); + Multiply(result.s2, alpha, results[w_two].s2); + Multiply(result.s3, alpha, results[w_two].s3); + Multiply(result.s4, alpha, results[w_two].s4); + Multiply(result.s5, alpha, results[w_two].s5); + Multiply(result.s6, alpha, results[w_two].s6); + Multiply(result.s7, alpha, results[w_two].s7); + Multiply(result.s8, alpha, results[w_two].s8); + Multiply(result.s9, alpha, results[w_two].s9); + Multiply(result.sA, alpha, results[w_two].sA); + Multiply(result.sB, alpha, results[w_two].sB); + Multiply(result.sC, alpha, results[w_two].sC); + Multiply(result.sD, alpha, results[w_two].sD); + Multiply(result.sE, alpha, results[w_two].sE); + Multiply(result.sF, alpha, results[w_two].sF); + #endif const int id_one = gid0*TRA_DIM + get_local_id(0); const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two; - dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two]; + dest[id_two*(ld/TRA_WPT) + id_one] = result; } } diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index 38c23346..2de0c7bd 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -31,7 +31,9 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, __global real* dest, + const __constant real* restrict arg_alpha, const int do_conjugate) { + const real alpha = arg_alpha[0]; // Local memory to store a tile of the matrix (for coalescing) __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; @@ -75,7 +77,7 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two, if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } - dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value; + Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value); } } } @@ -93,8 +95,10 @@ __kernel void TransposeMatrix(const int src_one, const int src_two, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, __global real* dest, + const __constant real* restrict arg_alpha, const int upper, const int lower, const int diagonal_imag_zero) { + const real alpha = arg_alpha[0]; // Local memory to store a tile of the matrix (for coalescing) __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; @@ -143,7 +147,7 @@ __kernel void TransposeMatrix(const int src_one, const int src_two, if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); } - dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value; + Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value); } } } diff --git a/src/routine.cc b/src/routine.cc index 4b334e60..1cf8bff8 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -302,6 +302,7 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev const size_t dest_one, const size_t dest_two, const size_t dest_ld, const size_t dest_offset, const Buffer<T> &dest, + const T alpha, const Program &program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const bool upper, const bool lower, @@ -339,6 +340,10 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev } } + // Upload the scalar argument as a constant buffer to the device (needed for half-precision) + auto alpha_buffer = Buffer<T>(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + // Retrieves the kernel from the compiled binary try { auto kernel = Kernel(program, kernel_name); @@ -348,6 +353,7 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev kernel.SetArgument(0, static_cast<int>(src_ld)); kernel.SetArgument(1, src()); kernel.SetArgument(2, dest()); + kernel.SetArgument(3, alpha_buffer()); } else { kernel.SetArgument(0, static_cast<int>(src_one)); @@ -360,13 +366,14 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev kernel.SetArgument(7, static_cast<int>(dest_ld)); kernel.SetArgument(8, static_cast<int>(dest_offset)); kernel.SetArgument(9, dest()); + kernel.SetArgument(10, alpha_buffer()); if (do_pad) { - kernel.SetArgument(10, static_cast<int>(do_conjugate)); + kernel.SetArgument(11, static_cast<int>(do_conjugate)); } else { - kernel.SetArgument(10, static_cast<int>(upper)); - kernel.SetArgument(11, static_cast<int>(lower)); - kernel.SetArgument(12, static_cast<int>(diagonal_imag_zero)); + kernel.SetArgument(11, static_cast<int>(upper)); + kernel.SetArgument(12, static_cast<int>(lower)); + kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero)); } } diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 6fa6a811..42d5f19e 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -145,7 +145,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, m_ceiled, k_ceiled, m_ceiled, 0, a_temp, - program, true, a_do_transpose, a_conjugate); + ConstantOne<T>(), program, + true, a_do_transpose, a_conjugate); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventProcessA); } @@ -156,7 +157,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - program, true, b_do_transpose, b_conjugate); + ConstantOne<T>(), program, + true, b_do_transpose, b_conjugate); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventProcessB); } @@ -167,7 +169,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offset, c_buffer, m_ceiled, n_ceiled, m_ceiled, 0, c_temp, - program, true, c_do_transpose, false); + ConstantOne<T>(), program, + true, c_do_transpose, false); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventProcessC); } @@ -205,7 +208,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, status = PadCopyTransposeMatrix(event_, eventWaitList, m_ceiled, n_ceiled, m_ceiled, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, - program, false, c_do_transpose, false); + ConstantOne<T>(), program, + false, c_do_transpose, false); if (ErrorIn(status)) { return status; } } diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index e83d105f..5ec1f8cd 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -132,7 +132,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co status = PadCopyTransposeMatrix(eventProcessA1.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, - program, true, ab_rotated, ab_conjugate); + ConstantOne<T>(), program, + true, ab_rotated, ab_conjugate); eventWaitList.push_back(eventProcessA1); if (ErrorIn(status)) { return status; } } @@ -141,7 +142,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co status = PadCopyTransposeMatrix(eventProcessA2.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, - program, true, ab_rotated, !ab_conjugate); + ConstantOne<T>(), program, + true, ab_rotated, !ab_conjugate); eventWaitList.push_back(eventProcessA2); if (ErrorIn(status)) { return status; } } @@ -150,7 +152,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co status = PadCopyTransposeMatrix(eventProcessB1.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, - program, true, ab_rotated, ab_conjugate); + ConstantOne<T>(), program, + true, ab_rotated, ab_conjugate); eventWaitList.push_back(eventProcessB1); if (ErrorIn(status)) { return status; } } @@ -159,7 +162,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co status = PadCopyTransposeMatrix(eventProcessB2.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, - program, true, ab_rotated, !ab_conjugate); + ConstantOne<T>(), program, + true, ab_rotated, !ab_conjugate); eventWaitList.push_back(eventProcessB2); if (ErrorIn(status)) { return status; } } @@ -170,7 +174,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - program, true, c_rotated, false); + ConstantOne<T>(), program, + true, c_rotated, false); eventWaitList.push_back(eventProcessC); if (ErrorIn(status)) { return status; } @@ -222,7 +227,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co status = PadCopyTransposeMatrix(event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - program, false, c_rotated, false, upper, lower, true); + ConstantOne<T>(), program, + false, c_rotated, false, upper, lower, true); if (ErrorIn(status)) { return status; } // Successfully finished the computation diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index 9ab50dd2..df97a94f 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -124,7 +124,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - program, true, a_rotated, a_conjugate); + ConstantOne<T>(), program, + true, a_rotated, a_conjugate); eventWaitList.push_back(eventProcessA); if (ErrorIn(status)) { return status; } } @@ -133,7 +134,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - program, true, a_rotated, b_conjugate); + ConstantOne<T>(), program, + true, a_rotated, b_conjugate); eventWaitList.push_back(eventProcessB); if (ErrorIn(status)) { return status; } } @@ -144,7 +146,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - program, true, c_rotated, false); + ConstantOne<T>(), program, + true, c_rotated, false); eventWaitList.push_back(eventProcessC); if (ErrorIn(status)) { return status; } @@ -180,7 +183,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons status = PadCopyTransposeMatrix(event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - program, false, c_rotated, false, upper, lower, true); + ConstantOne<T>(), program, + false, c_rotated, false, upper, lower, true); if (ErrorIn(status)) { return status; } // Successfully finished the computation diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index 49fbe64b..dd7d19fe 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -124,7 +124,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - program, true, ab_rotated, false); + ConstantOne<T>(), program, + true, ab_rotated, false); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventProcessA); } @@ -133,7 +134,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - program, true, ab_rotated, false); + ConstantOne<T>(), program, + true, ab_rotated, false); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventProcessB); } @@ -144,7 +146,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - program, true, c_rotated, false); + ConstantOne<T>(), program, + true, c_rotated, false); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventProcessC); @@ -193,7 +196,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons status = PadCopyTransposeMatrix(event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - program, false, c_rotated, false, upper, lower, false); + ConstantOne<T>(), program, + false, c_rotated, false, upper, lower, false); if (ErrorIn(status)) { return status; } // Successfully finished the computation diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index 9913c7ca..b5817b82 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -117,7 +117,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - program, true, a_rotated, false); + ConstantOne<T>(), program, + true, a_rotated, false); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventProcessA); } @@ -128,7 +129,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - program, true, c_rotated, false); + ConstantOne<T>(), program, + true, c_rotated, false); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventProcessC); @@ -164,7 +166,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const status = PadCopyTransposeMatrix(event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - program, false, c_rotated, false, upper, lower, false); + ConstantOne<T>(), program, + false, c_rotated, false, upper, lower, false); if (ErrorIn(status)) { return status; } diff --git a/src/routines/levelx/xomatcopy.cc b/src/routines/levelx/xomatcopy.cc new file mode 100644 index 00000000..77fc445f --- /dev/null +++ b/src/routines/levelx/xomatcopy.cc @@ -0,0 +1,103 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xomatcopy class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/levelx/xomatcopy.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xomatcopy<half>::precision_ = Precision::kHalf; +template <> const Precision Xomatcopy<float>::precision_ = Precision::kSingle; +template <> const Precision Xomatcopy<double>::precision_ = Precision::kDouble; +template <> const Precision Xomatcopy<float2>::precision_ = Precision::kComplexSingle; +template <> const Precision Xomatcopy<double2>::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xomatcopy<T>::Xomatcopy(Queue &queue, EventPointer event, const std::string &name): + Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, precision_) { + source_string_ = + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } + + // Determines whether to transpose the matrix A + const auto transpose = (a_transpose != Transpose::kNo); + + // In case of complex data-types, the transpose can also become a conjugate transpose + const auto conjugate = (a_transpose == Transpose::kConjugate); + + // Computes the dimensions of the two matrices + const auto rotated = (layout == Layout::kRowMajor); + const auto a_one = (rotated) ? n : m; + const auto a_two = (rotated) ? m : n; + const auto b_one = (transpose) ? a_two : a_one; + const auto b_two = (transpose) ? a_one : a_two; + + // Tests the matrices for validity, first from a perspective of the OpenCL buffers and their + // sizes, and then from a perspective of parameter values (e.g. m, n). Tests whether the OpenCL + // buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage space. + // Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than M when not-rotated + // matrix B cannot be less than M when rotated, or less than N when not-rotated + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Loads the program from the database + const auto program = GetProgramFromCache(); + + auto emptyEventList = std::vector<Event>(); + status = PadCopyTransposeMatrix(event_, emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + b_one, b_two, b_ld, b_offset, b_buffer, + alpha, program, false, transpose, conjugate); + if (ErrorIn(status)) { return status; } + + return StatusCode::kSuccess; +} + +// ================================================================================================= + +// Compiles the templated class +template class Xomatcopy<half>; +template class Xomatcopy<float>; +template class Xomatcopy<double>; +template class Xomatcopy<float2>; +template class Xomatcopy<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/tuning/copy_fast.cc b/src/tuning/copy_fast.cc index 2da707be..09fdbaba 100644 --- a/src/tuning/copy_fast.cc +++ b/src/tuning/copy_fast.cc @@ -37,7 +37,7 @@ class TuneCopy { } // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; } + static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; } // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } @@ -86,9 +86,11 @@ class TuneCopy { std::vector<T> &, std::vector<T> &, std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &, std::vector<T> &) { + auto alpha_buffer = std::vector<T>{args.alpha}; tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentInput(alpha_buffer); } // Describes how to compute the performance metrics diff --git a/src/tuning/copy_pad.cc b/src/tuning/copy_pad.cc index ec392471..7088b3bf 100644 --- a/src/tuning/copy_pad.cc +++ b/src/tuning/copy_pad.cc @@ -37,7 +37,7 @@ class TunePad { } // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; } + static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; } // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } @@ -86,6 +86,7 @@ class TunePad { std::vector<T> &, std::vector<T> &, std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &, std::vector<T> &) { + auto alpha_buffer = std::vector<T>{args.alpha}; tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentScalar(static_cast<int>(args.n)); tuner.AddArgumentScalar(static_cast<int>(args.m)); @@ -96,6 +97,7 @@ class TunePad { tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentScalar(0); tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentInput(alpha_buffer); tuner.AddArgumentScalar(0); } diff --git a/src/tuning/transpose_fast.cc b/src/tuning/transpose_fast.cc index 1a5260c1..3b0bdeb5 100644 --- a/src/tuning/transpose_fast.cc +++ b/src/tuning/transpose_fast.cc @@ -37,7 +37,7 @@ class TuneTranspose { } // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; } + static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; } // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } @@ -91,9 +91,11 @@ class TuneTranspose { std::vector<T> &, std::vector<T> &, std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &, std::vector<T> &) { + auto alpha_buffer = std::vector<T>{args.alpha}; tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentInput(alpha_buffer); } // Describes how to compute the performance metrics diff --git a/src/tuning/transpose_pad.cc b/src/tuning/transpose_pad.cc index 08b52510..b9ab3ffa 100644 --- a/src/tuning/transpose_pad.cc +++ b/src/tuning/transpose_pad.cc @@ -37,7 +37,7 @@ class TunePadTranspose { } // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; } + static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; } // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } @@ -90,6 +90,7 @@ class TunePadTranspose { std::vector<T> &, std::vector<T> &, std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &, std::vector<T> &) { + auto alpha_buffer = std::vector<T>{args.alpha}; tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentScalar(static_cast<int>(args.n)); tuner.AddArgumentScalar(static_cast<int>(args.m)); @@ -100,6 +101,7 @@ class TunePadTranspose { tuner.AddArgumentScalar(static_cast<int>(args.n)); tuner.AddArgumentScalar(0); tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentInput(alpha_buffer); tuner.AddArgumentScalar(0); } diff --git a/test/correctness/routines/levelx/xomatcopy.cc b/test/correctness/routines/levelx/xomatcopy.cc new file mode 100644 index 00000000..43021bc9 --- /dev/null +++ b/test/correctness/routines/levelx/xomatcopy.cc @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/levelx/xomatcopy.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests<clblast::TestXomatcopy<float>, float, float>(argc, argv, false, "SOMATCOPY"); + errors += clblast::RunTests<clblast::TestXomatcopy<double>, double, double>(argc, argv, true, "DOMATCOPY"); + errors += clblast::RunTests<clblast::TestXomatcopy<float2>, float2, float2>(argc, argv, true, "COMATCOPY"); + errors += clblast::RunTests<clblast::TestXomatcopy<double2>, double2, double2>(argc, argv, true, "ZOMATCOPY"); + errors += clblast::RunTests<clblast::TestXomatcopy<half>, half, half>(argc, argv, true, "HOMATCOPY"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index e5addc48..9f842d91 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -174,8 +174,8 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st template <typename T, typename U> void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name) { if (!PrecisionSupported<T>(device_)) { return; } - if (!compare_clblas_) { return; } - if (std::is_same<T, half>::value) { return; } + if (!compare_clblas_) { return; } // not supported for CPU BLAS routines + if (std::is_same<T, half>::value) { return; } // not supported for half-precision TestStart("invalid buffer sizes", name); // Iterates over all the to-be-tested combinations of arguments diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index cce10751..e849466a 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -129,6 +129,13 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na const auto reference_routine2 = C::RunReference2; // CBLAS #endif + // Non-BLAS routines cannot be fully tested + if (!silent && C::BLASLevel() == 4) { + fprintf(stdout, "\n* NOTE: This non-BLAS routine is tested against a custom implementation,\n"); + fprintf(stdout, " not against clBLAS or a CPU BLAS library. Thus, the arguments '-clblas'\n"); + fprintf(stdout, " and '-cblas' have no effect.\n"); + } + // Creates a tester auto options = C::GetOptions(); TestBlas<T,U> tester{argc, argv, silent, name, options, @@ -176,8 +183,9 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na auto ap_sizes = std::vector<size_t>{args.ap_size}; // Sets the dimensions of the matrices or vectors depending on the BLAS level - auto dimensions = (C::BLASLevel() == 3) ? tester.kMatrixDims : - (C::BLASLevel() == 2) ? tester.kMatrixVectorDims : + auto dimensions = (C::BLASLevel() == 4) ? tester.kMatrixDims : // non-BLAS extra routines + (C::BLASLevel() == 3) ? tester.kMatrixDims : // level 3 + (C::BLASLevel() == 2) ? tester.kMatrixVectorDims : // level 2 tester.kVectorDims; // else: level 1 // For the options relevant to this routine, sets the vectors to proper values @@ -318,7 +326,9 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na // Runs the tests tester.TestRegular(regular_test_vector, case_name); #ifdef CLBLAST_REF_CLBLAS - tester.TestInvalid(invalid_test_vector, case_name); + if (C::BLASLevel() != 4) { + tester.TestInvalid(invalid_test_vector, case_name); + } #endif } } diff --git a/test/performance/client.cc b/test/performance/client.cc index 5a7226df..4c0c79a6 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -42,8 +42,10 @@ Client<T,U>::Client(const Routine run_routine, // applicable, but are searched for anyway to be able to create one common argument parser. All // arguments have a default value in case they are not found. template <typename T, typename U> -Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric default_a_ld, - const GetMetric default_b_ld, const GetMetric default_c_ld) { +Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t level, + const GetMetric default_a_ld, + const GetMetric default_b_ld, + const GetMetric default_c_ld) { auto args = Arguments<U>{}; auto help = std::string{"\n* Options given/available:\n"}; @@ -116,6 +118,17 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric // which is thus always displayed (unless silence is specified). if (!args.silent) { fprintf(stdout, "%s\n", help.c_str()); } + // Comparison against a non-BLAS routine is not supported + if (level == 4) { // level-4 == level-X + if (args.compare_clblas != 0 || args.compare_cblas != 0) { + if (!args.silent) { + fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for this non-BLAS routine\n\n"); + } + } + args.compare_clblas = 0; + args.compare_cblas = 0; + } + // Comparison against clBLAS or a CPU BLAS library is not supported in case of half-precision if (args.precision == Precision::kHalf) { if (args.compare_clblas != 0 || args.compare_cblas != 0) { diff --git a/test/performance/client.h b/test/performance/client.h index 8d0597d7..493a7aed 100644 --- a/test/performance/client.h +++ b/test/performance/client.h @@ -53,8 +53,10 @@ class Client { // Parses all command-line arguments, filling in the arguments structure. If no command-line // argument is given for a particular argument, it is filled in with a default value. - Arguments<U> ParseArguments(int argc, char *argv[], const GetMetric default_a_ld, - const GetMetric default_b_ld, const GetMetric default_c_ld); + Arguments<U> ParseArguments(int argc, char *argv[], const size_t level, + const GetMetric default_a_ld, + const GetMetric default_b_ld, + const GetMetric default_c_ld); // The main client function, setting-up arguments, matrices, OpenCL buffers, etc. After set-up, it // calls the client routines. @@ -97,14 +99,14 @@ void RunClient(int argc, char *argv[]) { // Sets the reference to test against #ifdef CLBLAST_REF_CLBLAS - const auto reference1 = C::RunReference1; // clBLAS when available + auto reference1 = C::RunReference1; // clBLAS when available #else - const auto reference1 = ReferenceNotAvailable<T,U>; + auto reference1 = ReferenceNotAvailable<T,U>; #endif #ifdef CLBLAST_REF_CBLAS - const auto reference2 = C::RunReference2; // CBLAS when available + auto reference2 = C::RunReference2; // CBLAS when available #else - const auto reference2 = ReferenceNotAvailable<T,U>; + auto reference2 = ReferenceNotAvailable<T,U>; #endif // Creates a new client @@ -112,7 +114,8 @@ void RunClient(int argc, char *argv[]) { C::GetFlops, C::GetBytes); // Simple command line argument parser with defaults - auto args = client.ParseArguments(argc, argv, C::DefaultLDA, C::DefaultLDB, C::DefaultLDC); + auto args = client.ParseArguments(argc, argv, C::BLASLevel(), + C::DefaultLDA, C::DefaultLDB, C::DefaultLDC); if (args.print_help) { return; } // Runs the client diff --git a/test/performance/routines/levelx/xomatcopy.cc b/test/performance/routines/levelx/xomatcopy.cc new file mode 100644 index 00000000..851f6ee1 --- /dev/null +++ b/test/performance/routines/levelx/xomatcopy.cc @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/levelx/xomatcopy.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXomatcopy<half>, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient<clblast::TestXomatcopy<float>, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient<clblast::TestXomatcopy<double>, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient<clblast::TestXomatcopy<float2>, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient<clblast::TestXomatcopy<double2>, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/routines/levelx/xomatcopy.h b/test/routines/levelx/xomatcopy.h new file mode 100644 index 00000000..c499ba75 --- /dev/null +++ b/test/routines/levelx/xomatcopy.h @@ -0,0 +1,164 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a class with static methods to describe the Xomatcopy routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XOMATCOPY_H_ +#define CLBLAST_TEST_ROUTINES_XOMATCOPY_H_ + +#include <vector> +#include <string> + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class TestXomatcopy { + public: + + // The BLAS level: 4 for the extra routines + static size_t BLASLevel() { return 4; } + + // The list of arguments relevant for this routine + static std::vector<std::string> GetOptions() { + return {kArgM, kArgN, + kArgLayout, kArgATransp, + kArgALeadDim, kArgBLeadDim, + kArgAOffset, kArgBOffset, + kArgAlpha}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments<T> &args) { + const auto a_rotated = (args.layout == Layout::kRowMajor); + const auto a_two = (a_rotated) ? args.m : args.n; + return a_two * args.a_ld + args.a_offset; + } + static size_t GetSizeB(const Arguments<T> &args) { + const auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + const auto b_two = (b_rotated) ? args.n : args.m; + return b_two * args.b_ld + args.b_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments<T> &args) { + args.a_size = GetSizeA(args); + args.b_size = GetSizeB(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments<T> &args) { return args.n; } + static size_t DefaultLDB(const Arguments<T> &args) { return args.m; } + static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine + + // Describes which omatcopyose options are relevant for this routine + using Transposes = std::vector<Transpose>; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Omatcopy<T>(args.layout, args.a_transpose, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run a naive version of the routine (for correctness/performance comparison). + // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. + + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + return RunReference2(args, buffers, queue); + } + #endif + + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + + // Data transfer from OpenCL to std::vector + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + + // Checking for invalid arguments + const auto a_rotated = (args.layout == Layout::kRowMajor); + const auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + const auto a_base = (a_rotated) ? args.a_ld*(args.m-1) + args.n : args.a_ld*(args.n-1) + args.m; + const auto b_base = (b_rotated) ? args.b_ld*(args.m-1) + args.n : args.b_ld*(args.n-1) + args.m; + if ((args.m == 0) || (args.n == 0)) { return StatusCode::kInvalidDimension; } + if ((args.a_ld < args.m && !a_rotated) || (args.a_ld < args.n && a_rotated)) { return StatusCode::kInvalidLeadDimA; } + if ((args.b_ld < args.m && !b_rotated) || (args.b_ld < args.n && b_rotated)) { return StatusCode::kInvalidLeadDimB; } + if (buffers.a_mat.GetSize() < (a_base + args.a_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryA; } + if (buffers.b_mat.GetSize() < (b_base + args.b_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryB; } + + // Matrix copy, scaling, and/or transpose + for (auto id1 = size_t{0}; id1 < args.m; ++id1) { + for (auto id2 = size_t{0}; id2 < args.n; ++id2) { + const auto a_one = (a_rotated) ? id2 : id1; + const auto a_two = (a_rotated) ? id1 : id2; + const auto b_one = (b_rotated) ? id2 : id1; + const auto b_two = (b_rotated) ? id1 : id2; + const auto a_index = a_two * args.a_ld + a_one + args.a_offset; + const auto b_index = b_two * args.b_ld + b_one + args.b_offset; + b_mat_cpu[b_index] = args.alpha * a_mat_cpu[a_index]; + } + } + + // Data transfer back to OpenCL + buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); + return StatusCode::kSuccess; + } + #endif + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> result(args.b_size, static_cast<T>(0)); + buffers.b_mat.Read(queue, args.b_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments<T> &args) { return args.m; } + static size_t ResultID2(const Arguments<T> &args) { return args.n; } + static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) { + const auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + const auto b_one = (b_rotated) ? id2 : id1; + const auto b_two = (b_rotated) ? id1 : id2; + return b_two * args.b_ld + b_one + args.b_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments<T> &args) { + return args.m*args.n; + } + static size_t GetBytes(const Arguments<T> &args) { + return (2*args.m*args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XOMATCOPY_H_ +#endif |