From f726fbdc9fef937fbe32222f0e66aac8d7e2678c Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 18 Jun 2016 20:20:13 +0200 Subject: Moved all headers into the source tree, changed headers to .hpp extension --- src/buffer_test.hpp | 121 ++++++ src/cache.cc | 2 +- src/cache.hpp | 98 +++++ src/clblast.cc | 92 ++--- src/clblast_c.cc | 2 +- src/clpp11.hpp | 695 ++++++++++++++++++++++++++++++++++ src/database.cc | 120 ------ src/database/database.cc | 120 ++++++ src/database/database.hpp | 104 +++++ src/database/kernels/copy.hpp | 262 +++++++++++++ src/database/kernels/pad.hpp | 270 +++++++++++++ src/database/kernels/padtranspose.hpp | 270 +++++++++++++ src/database/kernels/transpose.hpp | 258 +++++++++++++ src/database/kernels/xaxpy.hpp | 270 +++++++++++++ src/database/kernels/xdot.hpp | 200 ++++++++++ src/database/kernels/xgemm.hpp | 263 +++++++++++++ src/database/kernels/xgemv.hpp | 231 +++++++++++ src/database/kernels/xger.hpp | 220 +++++++++++ src/public_api.hpp | 34 ++ src/routine.cc | 2 +- src/routine.hpp | 68 ++++ src/routines/common.cc | 2 +- src/routines/common.hpp | 173 +++++++++ src/routines/level1/xamax.cc | 2 +- src/routines/level1/xamax.hpp | 40 ++ src/routines/level1/xasum.cc | 2 +- src/routines/level1/xasum.hpp | 40 ++ src/routines/level1/xaxpy.cc | 2 +- src/routines/level1/xaxpy.hpp | 40 ++ src/routines/level1/xcopy.cc | 2 +- src/routines/level1/xcopy.hpp | 40 ++ src/routines/level1/xdot.cc | 2 +- src/routines/level1/xdot.hpp | 42 ++ src/routines/level1/xdotc.cc | 2 +- src/routines/level1/xdotc.hpp | 44 +++ src/routines/level1/xdotu.cc | 2 +- src/routines/level1/xdotu.hpp | 44 +++ src/routines/level1/xmax.hpp | 49 +++ src/routines/level1/xmin.hpp | 49 +++ src/routines/level1/xnrm2.cc | 2 +- src/routines/level1/xnrm2.hpp | 40 ++ src/routines/level1/xscal.cc | 2 +- src/routines/level1/xscal.hpp | 39 ++ src/routines/level1/xsum.hpp | 49 +++ src/routines/level1/xswap.cc | 2 +- src/routines/level1/xswap.hpp | 40 ++ src/routines/level2/xgbmv.cc | 2 +- 
src/routines/level2/xgbmv.hpp | 49 +++ src/routines/level2/xgemv.cc | 2 +- src/routines/level2/xgemv.hpp | 56 +++ src/routines/level2/xger.cc | 2 +- src/routines/level2/xger.hpp | 43 +++ src/routines/level2/xgerc.cc | 2 +- src/routines/level2/xgerc.hpp | 46 +++ src/routines/level2/xgeru.cc | 2 +- src/routines/level2/xgeru.hpp | 46 +++ src/routines/level2/xhbmv.cc | 2 +- src/routines/level2/xhbmv.hpp | 49 +++ src/routines/level2/xhemv.cc | 2 +- src/routines/level2/xhemv.hpp | 49 +++ src/routines/level2/xher.cc | 2 +- src/routines/level2/xher.hpp | 46 +++ src/routines/level2/xher2.cc | 2 +- src/routines/level2/xher2.hpp | 44 +++ src/routines/level2/xhpmv.cc | 2 +- src/routines/level2/xhpmv.hpp | 49 +++ src/routines/level2/xhpr.cc | 2 +- src/routines/level2/xhpr.hpp | 45 +++ src/routines/level2/xhpr2.cc | 2 +- src/routines/level2/xhpr2.hpp | 46 +++ src/routines/level2/xsbmv.cc | 2 +- src/routines/level2/xsbmv.hpp | 49 +++ src/routines/level2/xspmv.cc | 2 +- src/routines/level2/xspmv.hpp | 49 +++ src/routines/level2/xspr.cc | 2 +- src/routines/level2/xspr.hpp | 45 +++ src/routines/level2/xspr2.cc | 2 +- src/routines/level2/xspr2.hpp | 46 +++ src/routines/level2/xsymv.cc | 2 +- src/routines/level2/xsymv.hpp | 49 +++ src/routines/level2/xsyr.cc | 2 +- src/routines/level2/xsyr.hpp | 45 +++ src/routines/level2/xsyr2.cc | 2 +- src/routines/level2/xsyr2.hpp | 46 +++ src/routines/level2/xtbmv.cc | 2 +- src/routines/level2/xtbmv.hpp | 49 +++ src/routines/level2/xtpmv.cc | 2 +- src/routines/level2/xtpmv.hpp | 49 +++ src/routines/level2/xtrmv.cc | 2 +- src/routines/level2/xtrmv.hpp | 49 +++ src/routines/level3/xgemm.cc | 2 +- src/routines/level3/xgemm.hpp | 48 +++ src/routines/level3/xhemm.cc | 2 +- src/routines/level3/xhemm.hpp | 54 +++ src/routines/level3/xher2k.cc | 2 +- src/routines/level3/xher2k.hpp | 46 +++ src/routines/level3/xherk.cc | 2 +- src/routines/level3/xherk.hpp | 45 +++ src/routines/level3/xsymm.cc | 2 +- src/routines/level3/xsymm.hpp | 56 +++ 
src/routines/level3/xsyr2k.cc | 2 +- src/routines/level3/xsyr2k.hpp | 46 +++ src/routines/level3/xsyrk.cc | 2 +- src/routines/level3/xsyrk.hpp | 47 +++ src/routines/level3/xtrmm.cc | 2 +- src/routines/level3/xtrmm.hpp | 54 +++ src/routines/levelx/xomatcopy.cc | 2 +- src/routines/levelx/xomatcopy.hpp | 41 ++ src/tuning/copy_fast.cc | 122 ------ src/tuning/copy_pad.cc | 130 ------- src/tuning/kernels/copy_fast.cc | 122 ++++++ src/tuning/kernels/copy_pad.cc | 130 +++++++ src/tuning/kernels/transpose_fast.cc | 127 +++++++ src/tuning/kernels/transpose_pad.cc | 134 +++++++ src/tuning/kernels/xaxpy.cc | 125 ++++++ src/tuning/kernels/xdot.cc | 137 +++++++ src/tuning/kernels/xgemm.cc | 162 ++++++++ src/tuning/kernels/xgemv.cc | 156 ++++++++ src/tuning/kernels/xger.cc | 130 +++++++ src/tuning/transpose_fast.cc | 127 ------- src/tuning/transpose_pad.cc | 134 ------- src/tuning/tuning.hpp | 161 ++++++++ src/tuning/xaxpy.cc | 125 ------ src/tuning/xdot.cc | 137 ------- src/tuning/xgemm.cc | 162 -------- src/tuning/xgemv.cc | 156 -------- src/tuning/xger.cc | 130 ------- src/utilities.cc | 2 +- src/utilities.hpp | 257 +++++++++++++ 129 files changed, 7427 insertions(+), 1435 deletions(-) create mode 100644 src/buffer_test.hpp create mode 100644 src/cache.hpp create mode 100644 src/clpp11.hpp delete mode 100644 src/database.cc create mode 100644 src/database/database.cc create mode 100644 src/database/database.hpp create mode 100644 src/database/kernels/copy.hpp create mode 100644 src/database/kernels/pad.hpp create mode 100644 src/database/kernels/padtranspose.hpp create mode 100644 src/database/kernels/transpose.hpp create mode 100644 src/database/kernels/xaxpy.hpp create mode 100644 src/database/kernels/xdot.hpp create mode 100644 src/database/kernels/xgemm.hpp create mode 100644 src/database/kernels/xgemv.hpp create mode 100644 src/database/kernels/xger.hpp create mode 100644 src/public_api.hpp create mode 100644 src/routine.hpp create mode 100644 src/routines/common.hpp 
create mode 100644 src/routines/level1/xamax.hpp create mode 100644 src/routines/level1/xasum.hpp create mode 100644 src/routines/level1/xaxpy.hpp create mode 100644 src/routines/level1/xcopy.hpp create mode 100644 src/routines/level1/xdot.hpp create mode 100644 src/routines/level1/xdotc.hpp create mode 100644 src/routines/level1/xdotu.hpp create mode 100644 src/routines/level1/xmax.hpp create mode 100644 src/routines/level1/xmin.hpp create mode 100644 src/routines/level1/xnrm2.hpp create mode 100644 src/routines/level1/xscal.hpp create mode 100644 src/routines/level1/xsum.hpp create mode 100644 src/routines/level1/xswap.hpp create mode 100644 src/routines/level2/xgbmv.hpp create mode 100644 src/routines/level2/xgemv.hpp create mode 100644 src/routines/level2/xger.hpp create mode 100644 src/routines/level2/xgerc.hpp create mode 100644 src/routines/level2/xgeru.hpp create mode 100644 src/routines/level2/xhbmv.hpp create mode 100644 src/routines/level2/xhemv.hpp create mode 100644 src/routines/level2/xher.hpp create mode 100644 src/routines/level2/xher2.hpp create mode 100644 src/routines/level2/xhpmv.hpp create mode 100644 src/routines/level2/xhpr.hpp create mode 100644 src/routines/level2/xhpr2.hpp create mode 100644 src/routines/level2/xsbmv.hpp create mode 100644 src/routines/level2/xspmv.hpp create mode 100644 src/routines/level2/xspr.hpp create mode 100644 src/routines/level2/xspr2.hpp create mode 100644 src/routines/level2/xsymv.hpp create mode 100644 src/routines/level2/xsyr.hpp create mode 100644 src/routines/level2/xsyr2.hpp create mode 100644 src/routines/level2/xtbmv.hpp create mode 100644 src/routines/level2/xtpmv.hpp create mode 100644 src/routines/level2/xtrmv.hpp create mode 100644 src/routines/level3/xgemm.hpp create mode 100644 src/routines/level3/xhemm.hpp create mode 100644 src/routines/level3/xher2k.hpp create mode 100644 src/routines/level3/xherk.hpp create mode 100644 src/routines/level3/xsymm.hpp create mode 100644 
src/routines/level3/xsyr2k.hpp create mode 100644 src/routines/level3/xsyrk.hpp create mode 100644 src/routines/level3/xtrmm.hpp create mode 100644 src/routines/levelx/xomatcopy.hpp delete mode 100644 src/tuning/copy_fast.cc delete mode 100644 src/tuning/copy_pad.cc create mode 100644 src/tuning/kernels/copy_fast.cc create mode 100644 src/tuning/kernels/copy_pad.cc create mode 100644 src/tuning/kernels/transpose_fast.cc create mode 100644 src/tuning/kernels/transpose_pad.cc create mode 100644 src/tuning/kernels/xaxpy.cc create mode 100644 src/tuning/kernels/xdot.cc create mode 100644 src/tuning/kernels/xgemm.cc create mode 100644 src/tuning/kernels/xgemv.cc create mode 100644 src/tuning/kernels/xger.cc delete mode 100644 src/tuning/transpose_fast.cc delete mode 100644 src/tuning/transpose_pad.cc create mode 100644 src/tuning/tuning.hpp delete mode 100644 src/tuning/xaxpy.cc delete mode 100644 src/tuning/xdot.cc delete mode 100644 src/tuning/xgemm.cc delete mode 100644 src/tuning/xgemv.cc delete mode 100644 src/tuning/xger.cc create mode 100644 src/utilities.hpp (limited to 'src') diff --git a/src/buffer_test.hpp b/src/buffer_test.hpp new file mode 100644 index 00000000..80f5243f --- /dev/null +++ b/src/buffer_test.hpp @@ -0,0 +1,121 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are +// templated and thus header-only. 
+// +// ================================================================================================= + +#ifndef CLBLAST_BUFFER_TEST_H_ +#define CLBLAST_BUFFER_TEST_H_ + +#include "clblast.h" + +namespace clblast { +// ================================================================================================= + +// NOTE(review): in this copy of the patch the template parameter lists after 'template' and the +// template argument of 'Buffer' appear to have been stripped, although the bodies still use 'T'. +// +// Tests matrix 'A' for validity. Returns kInvalidLeadDimA when 'ld' is smaller than 'one', +// kInsufficientMemoryA when buffer.GetSize() is smaller than (ld * (two - 1) + one + offset) * +// sizeof(T), kInvalidMatrixA when anything throws during the size check, and kSuccess otherwise. +// NOTE(review): 'two - 1' is size_t arithmetic and wraps around for two == 0 - presumably callers +// reject empty dimensions before calling; verify. +template +StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer &buffer, + const size_t offset, const size_t ld) { + if (ld < one) { return StatusCode::kInvalidLeadDimA; } + try { + const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); + if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; } + } catch (...) { return StatusCode::kInvalidMatrixA; } + return StatusCode::kSuccess; +} + +// Tests matrix 'B' for validity: the same checks as for matrix 'A', reporting the 'B' status codes +template +StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer &buffer, + const size_t offset, const size_t ld) { + if (ld < one) { return StatusCode::kInvalidLeadDimB; } + try { + const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); + if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryB; } + } catch (...) { return StatusCode::kInvalidMatrixB; } + return StatusCode::kSuccess; +} + +// Tests matrix 'C' for validity: the same checks as for matrix 'A', reporting the 'C' status codes +template +StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer &buffer, + const size_t offset, const size_t ld) { + if (ld < one) { return StatusCode::kInvalidLeadDimC; } + try { + const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); + if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryC; } + } catch (...)
{ return StatusCode::kInvalidMatrixC; } + return StatusCode::kSuccess; +} + +// Tests matrix 'AP' for validity: requires buffer.GetSize() to be at least +// (n * (n + 1) / 2 + offset) * sizeof(T). Failures are reported with the status codes of matrix 'A'. +template +StatusCode TestMatrixAP(const size_t n, const Buffer &buffer, const size_t offset) { + try { + const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T); + if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; } + } catch (...) { return StatusCode::kInvalidMatrixA; } + return StatusCode::kSuccess; +} + +// ================================================================================================= + +// Tests vector 'X' for validity. Returns kInvalidIncrementX when 'inc' is zero, +// kInsufficientMemoryX when buffer.GetSize() is smaller than ((n - 1) * inc + 1 + offset) * +// sizeof(T), kInvalidVectorX when anything throws during the size check, and kSuccess otherwise. +// NOTE(review): 'n - 1' is size_t arithmetic and wraps around for n == 0 - presumably excluded by +// the callers; verify. +template +StatusCode TestVectorX(const size_t n, const Buffer &buffer, const size_t offset, + const size_t inc) { + if (inc == 0) { return StatusCode::kInvalidIncrementX; } + try { + const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T); + if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryX; } + } catch (...) { return StatusCode::kInvalidVectorX; } + return StatusCode::kSuccess; +} + +// Tests vector 'Y' for validity: the same checks as for vector 'X', reporting the 'Y' status codes +template +StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset, + const size_t inc) { + if (inc == 0) { return StatusCode::kInvalidIncrementY; } + try { + const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T); + if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryY; } + } catch (...) { return StatusCode::kInvalidVectorY; } + return StatusCode::kSuccess; +} + +// ================================================================================================= + +// Tests vector 'scalar' for validity: requires buffer.GetSize() to be at least +// (n + offset) * sizeof(T); there is no increment to check +template +StatusCode TestVectorScalar(const size_t n, const Buffer &buffer, const size_t offset) { + try { + const auto required_size = (n + offset) * sizeof(T); + if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; } + } catch (...)
{ return StatusCode::kInvalidVectorScalar; } + return StatusCode::kSuccess; +} + +// Tests vector 'index' for validity: requires buffer.GetSize() to be at least +// (n + offset) * sizeof(T). +// NOTE(review): failures are reported with the 'Scalar' status codes rather than index-specific +// ones - confirm that this is intentional. +template +StatusCode TestVectorIndex(const size_t n, const Buffer &buffer, const size_t offset) { + try { + const auto required_size = (n + offset) * sizeof(T); + if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; } + } catch (...) { return StatusCode::kInvalidVectorScalar; } + return StatusCode::kSuccess; +} + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_BUFFER_TEST_H_ +#endif diff --git a/src/cache.cc b/src/cache.cc index a34d351f..cd9055d0 100644 --- a/src/cache.cc +++ b/src/cache.cc @@ -15,7 +15,7 @@ #include #include -#include "internal/cache.h" +#include "cache.hpp" namespace clblast { // ================================================================================================= diff --git a/src/cache.hpp b/src/cache.hpp new file mode 100644 index 00000000..0d74d7bc --- /dev/null +++ b/src/cache.hpp @@ -0,0 +1,98 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the caching functionality of compiled binaries and programs.
+// +// ================================================================================================= + +#ifndef CLBLAST_CACHE_H_ +#define CLBLAST_CACHE_H_ + +#include +#include +#include + +#include "utilities.hpp" + +namespace clblast { +// ================================================================================================= + +// NOTE(review): the targets of the three bare '#include' directives above and the element types of +// the two 'std::vector' caches below appear to have been stripped from this copy of the patch. +// +// The cache of compiled OpenCL binaries, along with some meta-data: an entry is identified by the +// combination of device name, precision and routine name +struct BinaryCache { + std::string binary; + std::string device_name; + Precision precision; + std::string routine_name_; + + // Finds out whether this entry matches all three of the given properties + bool MatchInCache(const std::string &ref_device, const Precision &ref_precision, + const std::string &ref_routine) { + return (device_name == ref_device && + precision == ref_precision && + routine_name_ == ref_routine); + } +}; + +// The actual cache, implemented as a vector of the above data-type, and its mutex. +// NOTE(review): these are 'static' namespace-scope variables defined in a header, so every source +// file that includes it gets its own private copy of the vector and of the mutex - verify that only +// a single source file actually uses them. +static std::vector binary_cache_; +static std::mutex binary_cache_mutex_; + +// ================================================================================================= + +// The cache of compiled OpenCL programs, along with some meta-data: an entry is identified by the +// combination of context pointer, precision and routine name +struct ProgramCache { + Program program; + ContextPointer context_ptr; + Precision precision; + std::string routine_name_; + + // Finds out whether this entry matches all three of the given properties + bool MatchInCache(const ContextPointer ref_context, const Precision &ref_precision, + const std::string &ref_routine) { + return (context_ptr == ref_context && + precision == ref_precision && + routine_name_ == ref_routine); + } +}; + +// The actual cache, implemented as a vector of the above data-type, and its mutex. +// NOTE(review): header-level 'static' variables again, so there is one copy per including source +// file - verify that only a single source file actually uses them. +static std::vector program_cache_; +static std::mutex program_cache_mutex_; + +// ================================================================================================= + +// Stores the compiled binary or program in the cache +void StoreBinaryToCache(const std::string &binary, const std::string &device_name, + const Precision &precision,
const std::string &routine_name); +void StoreProgramToCache(const Program &program, const Context &context, + const Precision &precision, const std::string &routine_name); + +// Queries the cache and retrieves a matching binary or program. Assumes that the match is +// available, throws otherwise. +const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name); +const Program& GetProgramFromCache(const Context &context, const Precision &precision, + const std::string &routine_name); + +// Queries the cache to see whether or not the compiled kernel is already there. Since the 'Get' +// functions above assume that a match is available, these are the checks to perform beforehand. +bool BinaryIsInCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name); +bool ProgramIsInCache(const Context &context, const Precision &precision, + const std::string &routine_name); + +// ================================================================================================= + +// Clears the cache of stored binaries +StatusCode CacheClearAll(); + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_CACHE_H_ +#endif diff --git a/src/clblast.cc b/src/clblast.cc index d0f0c937..88d60772 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -16,60 +16,60 @@ #include #include "clblast.h" -#include "internal/public_api.h" -#include "internal/cache.h" +#include "public_api.hpp" +#include "cache.hpp" // BLAS level-1 includes -#include "internal/routines/level1/xswap.h" -#include "internal/routines/level1/xscal.h" -#include "internal/routines/level1/xcopy.h" -#include "internal/routines/level1/xaxpy.h" -#include "internal/routines/level1/xdot.h" -#include "internal/routines/level1/xdotu.h" -#include "internal/routines/level1/xdotc.h" -#include "internal/routines/level1/xnrm2.h" -#include "internal/routines/level1/xasum.h" -#include "internal/routines/level1/xsum.h" // non-BLAS routine -#include
"internal/routines/level1/xamax.h" -#include "internal/routines/level1/xmax.h" // non-BLAS routine -#include "internal/routines/level1/xmin.h" // non-BLAS routine +#include "routines/level1/xswap.hpp" +#include "routines/level1/xscal.hpp" +#include "routines/level1/xcopy.hpp" +#include "routines/level1/xaxpy.hpp" +#include "routines/level1/xdot.hpp" +#include "routines/level1/xdotu.hpp" +#include "routines/level1/xdotc.hpp" +#include "routines/level1/xnrm2.hpp" +#include "routines/level1/xasum.hpp" +#include "routines/level1/xsum.hpp" // non-BLAS routine +#include "routines/level1/xamax.hpp" +#include "routines/level1/xmax.hpp" // non-BLAS routine +#include "routines/level1/xmin.hpp" // non-BLAS routine // BLAS level-2 includes -#include "internal/routines/level2/xgemv.h" -#include "internal/routines/level2/xgbmv.h" -#include "internal/routines/level2/xhemv.h" -#include "internal/routines/level2/xhbmv.h" -#include "internal/routines/level2/xhpmv.h" -#include "internal/routines/level2/xsymv.h" -#include "internal/routines/level2/xsbmv.h" -#include "internal/routines/level2/xspmv.h" -#include "internal/routines/level2/xtrmv.h" -#include "internal/routines/level2/xtbmv.h" -#include "internal/routines/level2/xtpmv.h" -#include "internal/routines/level2/xger.h" -#include "internal/routines/level2/xgeru.h" -#include "internal/routines/level2/xgerc.h" -#include "internal/routines/level2/xher.h" -#include "internal/routines/level2/xhpr.h" -#include "internal/routines/level2/xher2.h" -#include "internal/routines/level2/xhpr2.h" -#include "internal/routines/level2/xsyr.h" -#include "internal/routines/level2/xspr.h" -#include "internal/routines/level2/xsyr2.h" -#include "internal/routines/level2/xspr2.h" +#include "routines/level2/xgemv.hpp" +#include "routines/level2/xgbmv.hpp" +#include "routines/level2/xhemv.hpp" +#include "routines/level2/xhbmv.hpp" +#include "routines/level2/xhpmv.hpp" +#include "routines/level2/xsymv.hpp" +#include "routines/level2/xsbmv.hpp" +#include 
"routines/level2/xspmv.hpp" +#include "routines/level2/xtrmv.hpp" +#include "routines/level2/xtbmv.hpp" +#include "routines/level2/xtpmv.hpp" +#include "routines/level2/xger.hpp" +#include "routines/level2/xgeru.hpp" +#include "routines/level2/xgerc.hpp" +#include "routines/level2/xher.hpp" +#include "routines/level2/xhpr.hpp" +#include "routines/level2/xher2.hpp" +#include "routines/level2/xhpr2.hpp" +#include "routines/level2/xsyr.hpp" +#include "routines/level2/xspr.hpp" +#include "routines/level2/xsyr2.hpp" +#include "routines/level2/xspr2.hpp" // BLAS level-3 includes -#include "internal/routines/level3/xgemm.h" -#include "internal/routines/level3/xsymm.h" -#include "internal/routines/level3/xhemm.h" -#include "internal/routines/level3/xsyrk.h" -#include "internal/routines/level3/xherk.h" -#include "internal/routines/level3/xsyr2k.h" -#include "internal/routines/level3/xher2k.h" -#include "internal/routines/level3/xtrmm.h" +#include "routines/level3/xgemm.hpp" +#include "routines/level3/xsymm.hpp" +#include "routines/level3/xhemm.hpp" +#include "routines/level3/xsyrk.hpp" +#include "routines/level3/xherk.hpp" +#include "routines/level3/xsyr2k.hpp" +#include "routines/level3/xher2k.hpp" +#include "routines/level3/xtrmm.hpp" // Level-x includes (non-BLAS) -#include "internal/routines/levelx/xomatcopy.h" +#include "routines/levelx/xomatcopy.hpp" namespace clblast { diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 22cb2192..9ea2c884 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -15,7 +15,7 @@ #include "clblast_c.h" #include "clblast.h" -#include "internal/utilities.h" +#include "utilities.hpp" // Shortcuts to the clblast namespace using float2 = clblast::float2; diff --git a/src/clpp11.hpp b/src/clpp11.hpp new file mode 100644 index 00000000..b834d8b4 --- /dev/null +++ b/src/clpp11.hpp @@ -0,0 +1,695 @@ + +// ================================================================================================= +// This file is part of the CLBlast 
project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API +// calls. The main benefits are increased abstraction, automatic memory management, and portability. +// Portability here means that a similar header exists for CUDA with the same classes and +// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. +// +// This file is taken from the Claduc project and therefore +// contains the following header copyright notice: +// +// ================================================================================================= +// +// Copyright 2015 SURFsara +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// ================================================================================================= + +#ifndef CLBLAST_CLPP11_H_ +#define CLBLAST_CLPP11_H_ + +// NOTE(review): text in angle brackets (the header names of the '#include' directives, template +// parameter lists and template arguments) appears to have been stripped from this copy of the +// patch; the trailing comments below still name the symbol each include is meant to provide. + +// C++ +#include // std::copy +#include // std::string +#include // std::vector +#include // std::shared_ptr +#include // std::runtime_error +#include // std::accumulate + +// OpenCL +#if defined(__APPLE__) || defined(__MACOSX) + #include +#else + #include +#endif + +namespace clblast { +// ================================================================================================= + +// Error occurred in the C++11 OpenCL header (this file): always throws a std::runtime_error whose +// text is "Internal OpenCL error: " followed by the given message +inline void Error(const std::string &message) { + throw std::runtime_error("Internal OpenCL error: "+message); +} + +// Error occurred in OpenCL: throws a std::runtime_error holding the numeric status code, unless +// the status equals CL_SUCCESS +inline void CheckError(const cl_int status) { + if (status != CL_SUCCESS) { + throw std::runtime_error("Internal OpenCL error: "+std::to_string(status)); + } +} + +// ================================================================================================= + +// C++11 version of 'cl_event'. The class only stores the handle: none of its members retains or +// releases the event. +class Event { + public: + + // Constructor based on the regular OpenCL data-type + explicit Event(const cl_event event): event_(event) { } + + // Regular constructor: starts out with a null event handle + explicit Event(): event_(nullptr) { } + + // Waits for completion of this event; throws through CheckError on failure + void WaitForCompletion() const { + CheckError(clWaitForEvents(1, &event_)); + } + + // Retrieves the elapsed time of the last recorded event.
Note that no error checking is done on +// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation: + // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx + // NOTE(review): the start/end counters are read into 'size_t' variables, while OpenCL documents + // them as 64-bit 'cl_ulong' values in nanoseconds - presumably this only works where size_t is + // 64 bits wide; verify. With nanosecond counters the 1.0e-6f factor yields milliseconds. + float GetElapsedTime() const { + WaitForCompletion(); + auto bytes = size_t{0}; + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); + auto time_start = size_t{0}; + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); + auto time_end = size_t{0}; + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); + return (time_end - time_start) * 1.0e-6f; + } + + // Accessor to the private data-member: both return mutable access to the stored handle + cl_event& operator()() { return event_; } + cl_event* pointer() { return &event_; } + private: + cl_event event_; +}; + +// Pointer to an OpenCL event +using EventPointer = cl_event*; + +// ================================================================================================= + +// C++11 version of 'cl_platform_id' +class Platform { + public: + + // Constructor based on the regular OpenCL data-type + explicit Platform(const cl_platform_id platform): platform_(platform) { } + + // Initializes the platform as entry number 'platform_id' of the list returned by + // clGetPlatformIDs. Throws through Error() when no platforms are found or when the ID is out of + // range, and through CheckError() when an OpenCL call fails. + explicit Platform(const size_t platform_id) { + auto num_platforms = cl_uint{0}; + CheckError(clGetPlatformIDs(0, nullptr, &num_platforms)); + if (num_platforms == 0) { Error("no platforms found"); } + auto platforms = std::vector(num_platforms); + CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr)); + if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); } + platform_ = platforms[platform_id]; + } + + // Returns the number of devices on this platform, counting all device types + size_t NumDevices() const { + auto result = cl_uint{0}; + CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &result)); + return
static_cast(result); + } + + // Accessor to the private data-member + const cl_platform_id& operator()() const { return platform_; } + private: + cl_platform_id platform_; +}; + +// ================================================================================================= + +// C++11 version of 'cl_device_id' +class Device { + public: + + // Constructor based on the regular OpenCL data-type + explicit Device(const cl_device_id device): device_(device) { } + + // Initialize the device. Note that this constructor can throw exceptions! It throws through + // Error() when the platform reports no devices or when 'device_id' is out of range, and through + // CheckError() when an OpenCL call fails. + explicit Device(const Platform &platform, const size_t device_id) { + auto num_devices = platform.NumDevices(); + if (num_devices == 0) { Error("no devices found"); } + auto devices = std::vector(num_devices); + CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast(num_devices), + devices.data(), nullptr)); + if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); } + device_ = devices[device_id]; + } + + // Methods to retrieve device information. Type() reports every device type other than CPU, GPU + // and accelerator as "default"; MemoryClock() and MemoryBusWidth() always return 0. + std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); } + std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); } + std::string Name() const { return GetInfoString(CL_DEVICE_NAME); } + std::string Type() const { + auto type = GetInfo(CL_DEVICE_TYPE); + switch(type) { + case CL_DEVICE_TYPE_CPU: return "CPU"; + case CL_DEVICE_TYPE_GPU: return "GPU"; + case CL_DEVICE_TYPE_ACCELERATOR: return "accelerator"; + default: return "default"; + } + } + size_t MaxWorkGroupSize() const { return GetInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE); } + size_t MaxWorkItemDimensions() const { + return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS); + } + std::vector MaxWorkItemSizes() const { + return GetInfoVector(CL_DEVICE_MAX_WORK_ITEM_SIZES); + } + size_t LocalMemSize() const { + return static_cast(GetInfo(CL_DEVICE_LOCAL_MEM_SIZE)); + } + std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); } + size_t
CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); } + size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); } + size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); } + size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); } + size_t MemoryClock() const { return 0; } // Not exposed in OpenCL + size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL + + // Configuration-validity checks. IsThreadConfigValid accepts a local work-group configuration + // only when the product of its entries does not exceed MaxWorkGroupSize() and its number of + // dimensions does not exceed MaxWorkItemDimensions(). + // NOTE(review): the header of the second loop in IsThreadConfigValid and the start of the + // comparison inside it appear to have been lost together with the other angle-bracket text in + // this copy - restore them from the upstream file before relying on this code. + bool IsLocalMemoryValid(const size_t local_mem_usage) const { + return (local_mem_usage <= LocalMemSize()); + } + bool IsThreadConfigValid(const std::vector &local) const { + auto local_size = size_t{1}; + for (const auto &item: local) { local_size *= item; } + for (auto i=size_t{0}; i MaxWorkItemSizes()[i]) { return false; } + } + if (local_size > MaxWorkGroupSize()) { return false; } + if (local.size() > MaxWorkItemDimensions()) { return false; } + return true; + } + + // Query for a specific type of device or brand: exact string comparisons against the values + // returned by Type() and Vendor() + bool IsCPU() const { return Type() == "CPU"; } + bool IsGPU() const { return Type() == "GPU"; } + bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; } + bool IsARM() const { return Vendor() == "ARM"; } + + // Accessor to the private data-member + const cl_device_id& operator()() const { return device_; } + private: + cl_device_id device_; + + // Private helper functions. Each one first asks OpenCL for the size of the property in bytes + // and then reads that many bytes. + // NOTE(review): the two GetInfo definitions look alike here because the template arguments were + // stripped; the second one reads the property into a 'cl_uint' and returns it as size_t. Both + // read 'bytes' bytes into a fixed-size local variable, which presumably is only safe when the + // property is no wider than that variable - verify for the 64-bit properties queried above. + template + T GetInfo(const cl_device_info info) const { + auto bytes = size_t{0}; + CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); + auto result = T(0); + CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); + return result; + } + size_t GetInfo(const cl_device_info info) const { + auto bytes = size_t{0}; + CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); + auto result = cl_uint(0); + CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); + return static_cast(result); + } + template + std::vector
GetInfoVector(const cl_device_info info) const { + auto bytes = size_t{0}; + CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); + auto result = std::vector(bytes/sizeof(T)); + CheckError(clGetDeviceInfo(device_, info, bytes, result.data(), nullptr)); + return result; + } + std::string GetInfoString(const cl_device_info info) const { + auto bytes = size_t{0}; + CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr)); + return std::string{result.c_str()}; // Removes any trailing '\0'-characters + } +}; + +// ================================================================================================= + +// C++11 version of 'cl_context'. The handle lives in a heap-allocated slot held by a shared +// pointer, so copies of a Context object share the same slot. +class Context { + public: + + // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere. + // No custom deleter is installed here, so only the slot holding the handle is freed and the + // OpenCL context itself is never released by this object. + explicit Context(const cl_context context): + context_(new cl_context) { + *context_ = context; + } + + // Regular constructor with memory management: creates a context for the single given device and + // installs a deleter that releases the context before freeing the slot. + // NOTE(review): CheckError can throw from inside that deleter, i.e. while the shared pointer is + // being destroyed - verify that a failing clReleaseContext cannot take the program down. + explicit Context(const Device &device): + context_(new cl_context, [](cl_context* c) { CheckError(clReleaseContext(*c)); delete c; }) { + auto status = CL_SUCCESS; + const cl_device_id dev = device(); + *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status); + CheckError(status); + } + + // Accessor to the private data-member + const cl_context& operator()() const { return *context_; } + cl_context* pointer() const { return &(*context_); } + private: + std::shared_ptr context_; +}; + +// Pointer to an OpenCL context +using ContextPointer = cl_context*; + +// ================================================================================================= + +// Enumeration of build statuses of the run-time compilation process +enum class BuildStatus { kSuccess, kError, kInvalid }; + +// C++11 version of 'cl_program'. Additionally holds the program's source code.
+class Program { + public: + // Note that there is no constructor based on the regular OpenCL data-type because of extra state + + // Source-based constructor with memory management + explicit Program(const Context &context, std::string source): + program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), + length_(source.length()), + source_(std::move(source)), + source_ptr_(&source_[0]) { + auto status = CL_SUCCESS; + *program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status); + CheckError(status); + } + + // Binary-based constructor with memory management + explicit Program(const Device &device, const Context &context, const std::string& binary): + program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), + length_(binary.length()), + source_(binary), + source_ptr_(&source_[0]) { + auto status1 = CL_SUCCESS; + auto status2 = CL_SUCCESS; + const cl_device_id dev = device(); + *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_, + reinterpret_cast(&source_ptr_), + &status1, &status2); + CheckError(status1); + CheckError(status2); + } + + // Compiles the device program and returns whether or not there where any warnings/errors + BuildStatus Build(const Device &device, std::vector &options) { + auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "}); + const cl_device_id dev = device(); + auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr); + if (status == CL_BUILD_PROGRAM_FAILURE) { + return BuildStatus::kError; + } + else if (status == CL_INVALID_BINARY) { + return BuildStatus::kInvalid; + } + else { + CheckError(status); + return BuildStatus::kSuccess; + } + } + + // Retrieves the warning/error message from the compiler (if any) + std::string GetBuildInfo(const Device &device) const { + auto bytes = size_t{0}; + auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG}; + 
CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr)); + return result; + } + + // Retrieves a binary or an intermediate representation of the compiled program + std::string GetIR() const { + auto bytes = size_t{0}; + CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr)); + auto result = std::string{}; + result.resize(bytes); + auto result_ptr = result.data(); + CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr)); + return result; + } + + // Accessor to the private data-member + const cl_program& operator()() const { return *program_; } + private: + std::shared_ptr program_; + size_t length_; + std::string source_; // Note: the source can also be a binary or IR + const char* source_ptr_; +}; + +// ================================================================================================= + +// C++11 version of 'cl_command_queue' +class Queue { + public: + + // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere + explicit Queue(const cl_command_queue queue): + queue_(new cl_command_queue) { + *queue_ = queue; + } + + // Regular constructor with memory management + explicit Queue(const Context &context, const Device &device): + queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s)); + delete s; }) { + auto status = CL_SUCCESS; + #ifdef CL_VERSION_2_0 + cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; + *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status); + #else + *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); + #endif + CheckError(status); + } + + // Synchronizes the queue + void Finish(Event &) const { + 
Finish(); + } + void Finish() const { + CheckError(clFinish(*queue_)); + } + + // Retrieves the corresponding context or device + Context GetContext() const { + auto bytes = size_t{0}; + CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, 0, nullptr, &bytes)); + cl_context result; + CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr)); + return Context(result); + } + Device GetDevice() const { + auto bytes = size_t{0}; + CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes)); + cl_device_id result; + CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr)); + return Device(result); + } + + // Accessor to the private data-member + const cl_command_queue& operator()() const { return *queue_; } + private: + std::shared_ptr queue_; +}; + +// ================================================================================================= + +// C++11 version of host memory +template +class BufferHost { + public: + + // Regular constructor with memory management + explicit BufferHost(const Context &, const size_t size): + buffer_(new std::vector(size)) { + } + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + return buffer_->size()*sizeof(T); + } + + // Compatibility with std::vector: 'end' is one past the last element, as for STL containers + size_t size() const { return buffer_->size(); } + T* begin() { return buffer_->data(); } + T* end() { return buffer_->data() + buffer_->size(); } + T& operator[](const size_t i) { return (*buffer_)[i]; } + T* data() { return buffer_->data(); } + const T* data() const { return buffer_->data(); } + + private: + std::shared_ptr> buffer_; +}; + +// ================================================================================================= + +// Enumeration of buffer access types +enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }; + +// C++11 version of 'cl_mem' +template +class Buffer { + public: + + // Constructor based on the regular OpenCL 
data-type: memory management is handled elsewhere + explicit Buffer(const cl_mem buffer): + buffer_(new cl_mem), + access_(BufferAccess::kNotOwned) { + *buffer_ = buffer; + } + + // Regular constructor with memory management. If this class does not own the buffer object, then + // the memory will not be freed automatically afterwards. + explicit Buffer(const Context &context, const BufferAccess access, const size_t size): + buffer_(new cl_mem, [access](cl_mem* m) { + if (access != BufferAccess::kNotOwned) { CheckError(clReleaseMemObject(*m)); } + delete m; + }), + access_(access) { + auto flags = cl_mem_flags{CL_MEM_READ_WRITE}; + if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; } + if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; } + auto status = CL_SUCCESS; + *buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status); + CheckError(status); + } + + // As above, but now with read/write access as a default + explicit Buffer(const Context &context, const size_t size): + Buffer(context, BufferAccess::kReadWrite, size) { + } + + // Constructs a new buffer based on an existing host-container + template + explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end): + Buffer(context, BufferAccess::kReadWrite, static_cast(end - start)) { + auto size = static_cast(end - start); + auto pointer = &*start; + CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer, 0, + nullptr, nullptr)); + queue.Finish(); + } + + // Copies from device to host: reading the device buffer a-synchronously + void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { + if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); } + CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T), + host, 0, nullptr, nullptr)); + } + void ReadAsync(const Queue &queue, const size_t size, 
std::vector &host, + const size_t offset = 0) const { + if (host.size() < size) { Error("target host buffer is too small"); } + ReadAsync(queue, size, host.data(), offset); + } + void ReadAsync(const Queue &queue, const size_t size, BufferHost &host, + const size_t offset = 0) const { + if (host.size() < size) { Error("target host buffer is too small"); } + ReadAsync(queue, size, host.data(), offset); + } + + // Copies from device to host: reading the device buffer + void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { + ReadAsync(queue, size, host, offset); + queue.Finish(); + } + void Read(const Queue &queue, const size_t size, std::vector &host, + const size_t offset = 0) const { + Read(queue, size, host.data(), offset); + } + void Read(const Queue &queue, const size_t size, BufferHost &host, + const size_t offset = 0) const { + Read(queue, size, host.data(), offset); + } + + // Copies from host to device: writing the device buffer a-synchronously + void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { + if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); } + if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); } + CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T), + host, 0, nullptr, nullptr)); + } + void WriteAsync(const Queue &queue, const size_t size, const std::vector &host, + const size_t offset = 0) { + WriteAsync(queue, size, host.data(), offset); + } + void WriteAsync(const Queue &queue, const size_t size, const BufferHost &host, + const size_t offset = 0) { + WriteAsync(queue, size, host.data(), offset); + } + + // Copies from host to device: writing the device buffer + void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { + WriteAsync(queue, size, host, offset); + queue.Finish(); + } + void Write(const Queue &queue, const size_t 
size, const std::vector &host, + const size_t offset = 0) { + Write(queue, size, host.data(), offset); + } + void Write(const Queue &queue, const size_t size, const BufferHost &host, + const size_t offset = 0) { + Write(queue, size, host.data(), offset); + } + + // Copies the contents of this buffer into another device buffer + void CopyToAsync(const Queue &queue, const size_t size, const Buffer &destination) const { + CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0, + nullptr, nullptr)); + } + void CopyTo(const Queue &queue, const size_t size, const Buffer &destination) const { + CopyToAsync(queue, size, destination); + queue.Finish(); + } + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + auto bytes = size_t{0}; + CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes)); + auto result = size_t{0}; + CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr)); + return result; + } + + // Accessor to the private data-member + const cl_mem& operator()() const { return *buffer_; } + private: + std::shared_ptr buffer_; + const BufferAccess access_; +}; + +// ================================================================================================= + +// C++11 version of 'cl_kernel' +class Kernel { + public: + + // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere + explicit Kernel(const cl_kernel kernel): + kernel_(new cl_kernel) { + *kernel_ = kernel; + } + + // Regular constructor with memory management + explicit Kernel(const Program &program, const std::string &name): + kernel_(new cl_kernel, [](cl_kernel* k) { CheckError(clReleaseKernel(*k)); delete k; }) { + auto status = CL_SUCCESS; + *kernel_ = clCreateKernel(program(), name.c_str(), &status); + CheckError(status); + } + + // Sets a kernel argument at the indicated position + template + void SetArgument(const size_t index, const T &value) { + 
CheckError(clSetKernelArg(*kernel_, static_cast(index), sizeof(T), &value)); + } + template + void SetArgument(const size_t index, Buffer &value) { + SetArgument(index, value()); + } + + // Sets all arguments in one go using parameter packs. Note that this overwrites previously set + // arguments using 'SetArgument' or 'SetArguments'. + template + void SetArguments(Args&... args) { + SetArgumentsRecursive(0, args...); + } + + // Retrieves the amount of local memory used per work-group for this kernel + size_t LocalMemUsage(const Device &device) const { + auto bytes = size_t{0}; + auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE}; + CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes)); + auto result = size_t{0}; + CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr)); + return result; + } + + // Launches a kernel onto the specified queue + void Launch(const Queue &queue, const std::vector &global, + const std::vector &local, EventPointer event) { + CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), + nullptr, global.data(), local.data(), + 0, nullptr, event)); + } + + // As above, but with an event waiting list + void Launch(const Queue &queue, const std::vector &global, + const std::vector &local, EventPointer event, + std::vector& waitForEvents) { + if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } + + // Builds a plain version of the events waiting list + auto waitForEventsPlain = std::vector(); + for (auto &waitEvent : waitForEvents) { + waitForEventsPlain.push_back(waitEvent()); + } + + // Launches the kernel while waiting for other events + CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), + nullptr, global.data(), local.data(), + static_cast(waitForEventsPlain.size()), + waitForEventsPlain.data(), + event)); + } + + // As above, but with the default local workgroup size + void Launch(const 
Queue &queue, const std::vector &global, EventPointer event) { + CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), + nullptr, global.data(), nullptr, + 0, nullptr, event)); + } + + // Accessor to the private data-member + const cl_kernel& operator()() const { return *kernel_; } + private: + std::shared_ptr kernel_; + + // Internal implementation for the recursive SetArguments function. + template + void SetArgumentsRecursive(const size_t index, T &first) { + SetArgument(index, first); + } + template + void SetArgumentsRecursive(const size_t index, T &first, Args&... args) { + SetArgument(index, first); + SetArgumentsRecursive(index+1, args...); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_CLPP11_H_ +#endif diff --git a/src/database.cc b/src/database.cc deleted file mode 100644 index e20ae340..00000000 --- a/src/database.cc +++ /dev/null @@ -1,120 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Database class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "internal/database.h" -#include "internal/database/xaxpy.h" -#include "internal/database/xdot.h" -#include "internal/database/xgemv.h" -#include "internal/database/xger.h" -#include "internal/database/xgemm.h" -#include "internal/database/copy.h" -#include "internal/database/pad.h" -#include "internal/database/transpose.h" -#include "internal/database/padtranspose.h" - -#include "internal/utilities.h" - -namespace clblast { -// ================================================================================================= - -// Initializes the database -const std::vector Database::database = { - XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble, - XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble, - XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble, - XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble, - XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, - CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, - PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble, - TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble, - PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble -}; - -// ================================================================================================= - -// Constructor, computing device properties and populating the parameter-vector from the database -Database::Database(const Queue &queue, const std::vector &kernels, - const Precision precision): - parameters_{} { - - // Finds information of the current device - auto device = queue.GetDevice(); - auto device_type = device.Type(); - auto device_vendor = device.Vendor(); - auto device_name = device.Name(); - - // Iterates 
over all kernels to include, and retrieves the parameters for each of them - for (auto &kernel: kernels) { - auto search_result = Search(kernel, device_type, device_vendor, device_name, precision); - parameters_.insert(search_result.begin(), search_result.end()); - } -} - -// ================================================================================================= - -// Returns a list of OpenCL pre-processor defines in string form -std::string Database::GetDefines() const { - std::string defines{}; - for (auto &parameter: parameters_) { - defines += "#define "+parameter.first+" "+ToString(parameter.second)+"\n"; - } - return defines; -} - -// ================================================================================================= - -// Searches the database for the right kernel and precision -Database::Parameters Database::Search(const std::string &this_kernel, - const std::string &this_type, - const std::string &this_vendor, - const std::string &this_device, - const Precision this_precision) const { - // Set the short vendor name - auto this_short_vendor = this_vendor; - for (auto &combination : kVendorNames) { - if (this_vendor == combination.first) { - this_short_vendor = combination.second; - } - } - - // Selects the right kernel - for (auto &db: database) { - if (db.kernel == this_kernel && db.precision == this_precision) { - - // Searches for the right vendor and device type, or selects the default if unavailable. This - // assumes that the default vendor / device type is last in the database. - for (auto &vendor: db.vendors) { - if ((vendor.name == this_short_vendor || vendor.name == kDeviceVendorAll) && - (vendor.type == this_type || vendor.type == kDeviceTypeAll)) { - - // Searches for the right device. If the current device is unavailable, selects the vendor - // default parameters. This assumes the default is last in the database. 
- for (auto &device: vendor.devices) { - - if (device.name == this_device || device.name == "default") { - - // Sets the parameters accordingly - return device.parameters; - } - } - } - } - } - } - - // If we reached this point, something is wrong - throw std::runtime_error("Database error, could not find a suitable entry"); -} - -// ================================================================================================= -} // namespace clblast diff --git a/src/database/database.cc b/src/database/database.cc new file mode 100644 index 00000000..6ec93731 --- /dev/null +++ b/src/database/database.cc @@ -0,0 +1,120 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Database class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "utilities.hpp" + +#include "database/database.hpp" +#include "database/kernels/xaxpy.hpp" +#include "database/kernels/xdot.hpp" +#include "database/kernels/xgemv.hpp" +#include "database/kernels/xger.hpp" +#include "database/kernels/xgemm.hpp" +#include "database/kernels/copy.hpp" +#include "database/kernels/pad.hpp" +#include "database/kernels/transpose.hpp" +#include "database/kernels/padtranspose.hpp" + +namespace clblast { +// ================================================================================================= + +// Initializes the database +const std::vector Database::database = { + XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble, + XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble, + XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble, + XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble, + XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, + CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, + PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble, + TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble, + PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble +}; + +// ================================================================================================= + +// Constructor, computing device properties and populating the parameter-vector from the database +Database::Database(const Queue &queue, const std::vector &kernels, + const Precision precision): + parameters_{} { + + // Finds information of the current device + auto device = queue.GetDevice(); + auto device_type = device.Type(); + auto device_vendor = device.Vendor(); + auto device_name = device.Name(); + + // Iterates 
over all kernels to include, and retrieves the parameters for each of them + for (auto &kernel: kernels) { + auto search_result = Search(kernel, device_type, device_vendor, device_name, precision); + parameters_.insert(search_result.begin(), search_result.end()); + } +} + +// ================================================================================================= + +// Returns a list of OpenCL pre-processor defines in string form +std::string Database::GetDefines() const { + std::string defines{}; + for (auto &parameter: parameters_) { + defines += "#define "+parameter.first+" "+ToString(parameter.second)+"\n"; + } + return defines; +} + +// ================================================================================================= + +// Searches the database for the right kernel and precision +Database::Parameters Database::Search(const std::string &this_kernel, + const std::string &this_type, + const std::string &this_vendor, + const std::string &this_device, + const Precision this_precision) const { + // Set the short vendor name + auto this_short_vendor = this_vendor; + for (auto &combination : kVendorNames) { + if (this_vendor == combination.first) { + this_short_vendor = combination.second; + } + } + + // Selects the right kernel + for (auto &db: database) { + if (db.kernel == this_kernel && db.precision == this_precision) { + + // Searches for the right vendor and device type, or selects the default if unavailable. This + // assumes that the default vendor / device type is last in the database. + for (auto &vendor: db.vendors) { + if ((vendor.name == this_short_vendor || vendor.name == kDeviceVendorAll) && + (vendor.type == this_type || vendor.type == kDeviceTypeAll)) { + + // Searches for the right device. If the current device is unavailable, selects the vendor + // default parameters. This assumes the default is last in the database. 
+ for (auto &device: vendor.devices) { + + if (device.name == this_device || device.name == "default") { + + // Sets the parameters accordingly + return device.parameters; + } + } + } + } + } + } + + // If we reached this point, something is wrong + throw std::runtime_error("Database error, could not find a suitable entry"); +} + +// ================================================================================================= +} // namespace clblast diff --git a/src/database/database.hpp b/src/database/database.hpp new file mode 100644 index 00000000..0987cbed --- /dev/null +++ b/src/database/database.hpp @@ -0,0 +1,104 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Database class, providing a static variable holding the actual database +// information. The class also provides utility functions to search the database and to access a +// found entry by parameter-key. The database itself is filled in the corresponding source-file and +// partially also by the database/xxxxx.h files, in which kernel-specific parameters are found. 
+// +// ================================================================================================= + +#ifndef CLBLAST_DATABASE_H_ +#define CLBLAST_DATABASE_H_ + +#include +#include +#include + +#include "utilities.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +class Database { + public: + + // Type alias for the database parameters + using Parameters = std::unordered_map; + + // Structures for content inside the database + struct DatabaseDevice { + const std::string name; + const Parameters parameters; + }; + struct DatabaseVendor { + const std::string type; + const std::string name; + const std::vector devices; + }; + struct DatabaseEntry { + const std::string kernel; + const Precision precision; + const std::vector vendors; + }; + + // The OpenCL device types + static constexpr auto kDeviceTypeCPU = "CPU"; + static constexpr auto kDeviceTypeGPU = "GPU"; + static constexpr auto kDeviceTypeAccelerator = "accelerator"; + static constexpr auto kDeviceTypeAll = "default"; + + // The OpenCL device vendors + static constexpr auto kDeviceVendorAll = "default"; + + // Alternative names for some OpenCL vendors + const std::unordered_map kVendorNames { + {"Intel(R) Corporation", "Intel"}, + {"GenuineIntel", "Intel"}, + {"Advanced Micro Devices, Inc.", "AMD"}, + {"NVIDIA Corporation", "NVIDIA"}, + }; + + // The database consists of separate database entries, stored together in a vector + static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble; + static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble; + static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble; + static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble; + static const DatabaseEntry 
XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; + static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble; + static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble; + static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble; + static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble; + static const std::vector database; + + // The constructor + explicit Database(const Queue &queue, const std::vector &routines, + const Precision precision); + + // Accessor of values by key (throws std::out_of_range if the key is not in the database) + size_t operator[](const std::string &key) const { return parameters_.at(key); } + + // Obtain a list of OpenCL pre-processor defines based on the parameters + std::string GetDefines() const; + + private: + Parameters Search(const std::string &this_kernel, const std::string &this_type, + const std::string &this_vendor, const std::string &this_device, + const Precision this_precision) const; + + // Found parameters suitable for this device/kernel + Parameters parameters_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_DATABASE_H_ +#endif diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp new file mode 100644 index 00000000..201e8b8a --- /dev/null +++ b/src/database/kernels/copy.hpp @@ -0,0 +1,262 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Copy' kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::CopyHalf = { + "Copy", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::CopySingle = { + "Copy", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, + { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } }, + { "Intel(R) Core(TM) i7-3770 
CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, + { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, + { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} 
} }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::CopyComplexSingle = { + "Copy", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, + { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} 
} }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, + { "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::CopyDouble = { + "Copy", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } }, + } + }, + { // Intel 
CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } }, + { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + } + }, + } +}; + +// 
================================================================================================= + +const Database::DatabaseEntry Database::CopyComplexDouble = { + "Copy", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, + { "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 680", { 
{"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp new file mode 100644 index 00000000..cc703dd6 --- /dev/null +++ b/src/database/kernels/pad.hpp @@ -0,0 +1,270 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Pad' kernels. 
+// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::PadHalf = { + "Pad", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::PadSingle = { + "Pad", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + { "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, 
{"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + } +}; + +// 
================================================================================================= + +const Database::DatabaseEntry Database::PadComplexSingle = { + "Pad", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, + { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, + { "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, 
{"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::PadDouble = { + "Pad", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // ARM GPUs + 
kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // Default + 
kDeviceTypeAll, "default", { + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::PadComplexDouble = { + "Pad", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { 
"GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp new file mode 100644 index 00000000..f3b1f262 --- /dev/null +++ b/src/database/kernels/padtranspose.hpp @@ -0,0 +1,270 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Padtranspose' kernels. 
+// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::PadtransposeHalf = { + "Padtranspose", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::PadtransposeSingle = { + "Padtranspose", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, 
{"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, + { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, + { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::PadtransposeComplexSingle = { + "Padtranspose", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, 
"AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 680", { 
{"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::PadtransposeDouble = { + "Padtranspose", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { 
{"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, + { "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::PadtransposeComplexDouble = { + "Padtranspose", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Tahiti", { {"PADTRA_PAD",0}, 
{"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { 
"default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp new file mode 100644 index 00000000..0c893dae --- /dev/null +++ b/src/database/kernels/transpose.hpp @@ -0,0 +1,258 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Transpose' kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::TransposeHalf = { + "Transpose", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::TransposeSingle = { + "Transpose", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } 
}, + { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, + { "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, 
{"TRA_WPT",2} } }, + { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::TransposeComplexSingle = { + "Transpose", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, 
{"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + } + }, + } +}; + +// 
================================================================================================= + +const Database::DatabaseEntry Database::TransposeDouble = { + "Transpose", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, 
{"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::TransposeComplexDouble = { + "Transpose", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", 
{ {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX 980", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp new file mode 100644 index 00000000..6e6719e8 --- /dev/null +++ b/src/database/kernels/xaxpy.hpp @@ -0,0 +1,270 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Xaxpy' kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XaxpyHalf = { + "Xaxpy", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, + { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XaxpySingle = { + "Xaxpy", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } }, + { "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, + { "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, + { "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + } + }, 
+ { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } }, + { "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",1024}, {"WPT",2} } }, + { "default", { {"VW",2}, {"WGS",1024}, {"WPT",2} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 480", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 680", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, + { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, + { "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, + { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, + { "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XaxpyComplexSingle = { + "Xaxpy", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } }, + { "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, + { "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "default", { {"VW",1}, 
{"WGS",256}, {"WPT",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, + { "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, + { "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, + { "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, + { "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, + { "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XaxpyDouble = { + "Xaxpy", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute 
Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, + { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"VW",2}, {"WGS",128}, {"WPT",2} } }, + { "default", { {"VW",2}, {"WGS",128}, {"WPT",2} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, + { "default", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 480", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, + { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, + { "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, + { "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XaxpyComplexDouble = { + 
"Xaxpy", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"VW",1}, {"WGS",64}, {"WPT",8} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",8} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } }, + { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, + { "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, + { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, + { "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + } + }, + } +}; + +// 
================================================================================================= +} // namespace clblast diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp new file mode 100644 index 00000000..d09d8c62 --- /dev/null +++ b/src/database/kernels/xdot.hpp @@ -0,0 +1,200 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Xdot' kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XdotHalf = { + "Xdot", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",32}, {"WGS2",32} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",32}, {"WGS2",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XdotSingle = { + "Xdot", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } }, + { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, + { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, + { "Tahiti", { {"WGS1",128}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 
2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, + { "default", { {"WGS1",1024}, {"WGS2",32} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } }, + { "Iris Pro", { {"WGS1",512}, {"WGS2",64} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"WGS1",128}, {"WGS2",32} } }, + { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } }, + { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } }, + { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, + { "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",64}, {"WGS2",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XdotComplexSingle = { + "Xdot", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, + { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, + { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, + { "Tahiti", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, + { "default", { {"WGS1",1024}, {"WGS2",32} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } }, + { "Iris Pro", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",32}, {"WGS2",32} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"WGS1",64}, {"WGS2",32} } }, + { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 680", { 
{"WGS1",128}, {"WGS2",64} } }, + { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } }, + { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, + { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",32}, {"WGS2",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XdotDouble = { + "Xdot", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } }, + { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, + { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, + { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",64} } }, + { "default", { {"WGS1",512}, {"WGS2",64} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"WGS1",128}, {"WGS2",32} } }, + { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, + { "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } }, + { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, + { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",64}, {"WGS2",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XdotComplexDouble = { + "Xdot", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, + { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, + { "Pitcairn", { 
{"WGS1",256}, {"WGS2",32} } }, + { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, + { "default", { {"WGS1",1024}, {"WGS2",32} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"WGS1",64}, {"WGS2",32} } }, + { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } }, + { "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } }, + { "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } }, + { "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",64}, {"WGS2",32} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp new file mode 100644 index 00000000..f35d2c88 --- /dev/null +++ b/src/database/kernels/xgemm.hpp @@ -0,0 +1,263 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Xgemm' kernels. 
+// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmHalf = { + "Xgemm", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmSingle = { + "Xgemm", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } }, + { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, + { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, 
{"VWM",8}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, + { "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, + { "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + { "default", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + { // Intel accelerators + 
kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, + { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, + { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } }, + { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } }, + { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } }, + { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, 
{"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, + { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmComplexSingle = { + "Xgemm", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, + { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + 
{ "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, + { "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, 
{"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, + { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, 
{"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, + { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmDouble = { + "Xgemm", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, 
{"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, 
"NVIDIA", { + { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, + { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, + { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, + { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + { // Default + 
kDeviceTypeAll, "default", { + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmComplexDouble = { + "Xgemm", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, 
{"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, + { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, 
{"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp new file mode 100644 index 00000000..6b76c8ac --- /dev/null +++ b/src/database/kernels/xgemv.hpp @@ -0,0 +1,231 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Xgemv' kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvHalf = { + "Xgemv", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvSingle = { + "Xgemv", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel CPUs 
+ kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",4} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, + { "Iris", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } }, + { "Iris Pro", { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, + { "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } }, + { "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, 
{"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Tesla K20m", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvComplexSingle = { + "Xgemv", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Pitcairn", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i7-3770 
CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, + { "Iris", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Iris Pro", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, // NOTE: VW3/WGS3/WPT3 mirror the NVIDIA "default" entry (no device-specific values recorded) + { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, // NOTE: VW2 through WPT3 mirror the NVIDIA "default" entry (no device-specific values recorded) + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, 
{"WPT3",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvDouble = { + "Xgemv", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",2} } }, + { "default", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, 
{"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, + { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } }, + { "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Tesla K20m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvComplexDouble = { + "Xgemv", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { 
"default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp new file mode 100644 index 00000000..f2e0a36f --- /dev/null +++ b/src/database/kernels/xger.hpp @@ -0,0 +1,220 @@ + +// ================================================================================================= +// This file is part 
of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Xger' kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XgerHalf = { + "Xger", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgerSingle = { + "Xger", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, + { "Hawaii", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, + { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } }, + { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, + { "default", { {"WGS1",128}, {"WGS2",1}, 
{"WPT",4} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } }, + { "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } }, + { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",4} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, + { "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, + { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, + { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgerComplexSingle = { + "Xger", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, + { "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, + { "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, + { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, + { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, + { "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } 
}, + { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, + { "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } }, + { "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, + { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgerDouble = { + "Xger", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, + { "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } }, + { "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } }, + { "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, + { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, + { "default", { {"WGS1",16}, {"WGS2",4}, {"WPT",2} } }, + } + }, + { // Default + kDeviceTypeAll, 
"default", { + { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgerComplexDouble = { + "Xger", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } }, + { "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, + { "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, + } + }, + { // ARM GPUs + kDeviceTypeGPU, "ARM", { + { "Mali-T628", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } }, + { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, + { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, + { "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, + { "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } }, + { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "default", { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/public_api.hpp b/src/public_api.hpp new file mode 100644 index 00000000..d0732297 --- /dev/null +++ b/src/public_api.hpp @@ -0,0 +1,34 @@ + +// ================================================================================================= 
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file provides macros to define the public API. This is needed when building a Windows DLL. +// Note: this is only used for the C++ interface, the C interface has its own definition included in +// the header file itself. +// +// ================================================================================================= + +#ifndef CLBLAST_PUBLIC_API_H_ +#define CLBLAST_PUBLIC_API_H_ + +namespace clblast { +// ================================================================================================= + +// Exports library functions under Windows when building a DLL. See also: +// https://msdn.microsoft.com/en-us/library/a90k134d.aspx +#ifdef _WIN32 + #define PUBLIC_API __declspec(dllexport) +#else + #define PUBLIC_API +#endif + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_PUBLIC_API_H_ +#endif diff --git a/src/routine.cc b/src/routine.cc index 11633ede..d3590896 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -14,7 +14,7 @@ #include #include -#include "internal/routine.h" +#include "routine.hpp" namespace clblast { // ================================================================================================= diff --git a/src/routine.hpp b/src/routine.hpp new file mode 100644 index 00000000..54b5779f --- /dev/null +++ b/src/routine.hpp @@ -0,0 +1,68 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0.
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements all the basic functionality for the BLAS routines. This class serves as a +// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as +// compiling the OpenCL kernel, connecting to the database, etc. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINE_H_ +#define CLBLAST_ROUTINE_H_ + +#include +#include + +#include "utilities.hpp" +#include "cache.hpp" +#include "buffer_test.hpp" +#include "database/database.hpp" +#include "routines/common.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +class Routine { + public: + + // Base class constructor + explicit Routine(Queue &queue, EventPointer event, const std::string &name, + const std::vector &routines, const Precision precision); + + // Set-up phase of the kernel + StatusCode SetUp(); + + protected: + + // Non-static variable for the precision + const Precision precision_; + + // The routine's name and its kernel-source in string form + const std::string routine_name_; + std::string source_string_; + + // The OpenCL objects, accessible only from derived classes + Queue queue_; + EventPointer event_; + const Context context_; + const Device device_; + + // OpenCL device properties + const std::string device_name_; + + // Connection to the database for all the device-specific parameters + const Database db_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINE_H_ +#endif diff --git a/src/routines/common.cc b/src/routines/common.cc index 
561a1bd8..c378df28 100644 --- a/src/routines/common.cc +++ b/src/routines/common.cc @@ -13,7 +13,7 @@ #include -#include "internal/routines/common.h" +#include "routines/common.hpp" namespace clblast { // ================================================================================================= diff --git a/src/routines/common.hpp b/src/routines/common.hpp new file mode 100644 index 00000000..c99cd39d --- /dev/null +++ b/src/routines/common.hpp @@ -0,0 +1,173 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains all the interfaces to common kernels, such as copying, padding, and +// transposing a matrix. These functions are templated and thus header-only. This file also contains +// other common functions to routines, such as a function to launch a kernel. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_COMMON_H_ +#define CLBLAST_ROUTINES_COMMON_H_ + +#include +#include + +#include "clblast.h" +#include "clpp11.hpp" +#include "database/database.hpp" + +namespace clblast { +// ================================================================================================= + +// Enqueues a kernel, waits for completion, and checks for errors +StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, + std::vector global, const std::vector &local, + EventPointer event, std::vector& waitForEvents); + +// As above, but without an event waiting list +StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, + std::vector global, const std::vector &local, + EventPointer event); + +// ================================================================================================= + +// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able +// to write to symmetric and triangular matrices through optional arguments. 
+template +StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Context &context, + const Database &db, + EventPointer event, std::vector& waitForEvents, + const size_t src_one, const size_t src_two, + const size_t src_ld, const size_t src_offset, + const Buffer &src, + const size_t dest_one, const size_t dest_two, + const size_t dest_ld, const size_t dest_offset, + const Buffer &dest, + const T alpha, + const Program &program, const bool do_pad, + const bool do_transpose, const bool do_conjugate, + const bool upper = false, const bool lower = false, + const bool diagonal_imag_zero = false) { + + // Determines whether or not the fast-version could potentially be used + auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && + (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) && + (upper == false) && (lower == false) && (diagonal_imag_zero == false); + + // Determines the right kernel (fast kernels need sizes divisible by per-thread work * local size) + auto kernel_name = std::string{}; + if (do_transpose) { + if (use_fast_kernel && + IsMultiple(src_ld, db["TRA_WPT"]) && + IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) && + IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) { + kernel_name = "TransposeMatrixFast"; + } + else { + use_fast_kernel = false; + kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix"; + } + } + else { + if (use_fast_kernel && + IsMultiple(src_ld, db["COPY_VW"]) && + IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) && + IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) { + kernel_name = "CopyMatrixFast"; + } + else { + use_fast_kernel = false; + kernel_name = (do_pad) ?
"CopyPadMatrix" : "CopyMatrix"; + } + } + + // Upload the scalar argument as a constant buffer to the device (needed for half-precision) + auto alpha_buffer = Buffer(context, 1); + alpha_buffer.Write(queue, 1, &alpha); + + // Retrieves the kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast(src_ld)); + kernel.SetArgument(1, src()); + kernel.SetArgument(2, dest()); + kernel.SetArgument(3, alpha_buffer()); + } + else { + kernel.SetArgument(0, static_cast(src_one)); + kernel.SetArgument(1, static_cast(src_two)); + kernel.SetArgument(2, static_cast(src_ld)); + kernel.SetArgument(3, static_cast(src_offset)); + kernel.SetArgument(4, src()); + kernel.SetArgument(5, static_cast(dest_one)); + kernel.SetArgument(6, static_cast(dest_two)); + kernel.SetArgument(7, static_cast(dest_ld)); + kernel.SetArgument(8, static_cast(dest_offset)); + kernel.SetArgument(9, dest()); + kernel.SetArgument(10, alpha_buffer()); + if (do_pad) { + kernel.SetArgument(11, static_cast(do_conjugate)); + } + else { + kernel.SetArgument(11, static_cast(upper)); + kernel.SetArgument(12, static_cast(lower)); + kernel.SetArgument(13, static_cast(diagonal_imag_zero)); + } + } + + // Launches the kernel and returns the error code. Uses global and local thread sizes based on + // parameters in the database.
+ if (do_transpose) { + if (use_fast_kernel) { + const auto global = std::vector{ + dest_one / db["TRA_WPT"], + dest_two / db["TRA_WPT"] + }; + const auto local = std::vector{db["TRA_DIM"], db["TRA_DIM"]}; + return RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + else { + const auto global = std::vector{ + Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), + Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]) + }; + const auto local = std::vector{db["PADTRA_TILE"], db["PADTRA_TILE"]}; + return RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + } + else { + if (use_fast_kernel) { + const auto global = std::vector{ + dest_one / db["COPY_VW"], + dest_two / db["COPY_WPT"] + }; + const auto local = std::vector{db["COPY_DIMX"], db["COPY_DIMY"]}; + return RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + else { + const auto global = std::vector{ + Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), + Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]) + }; + const auto local = std::vector{db["PAD_DIMX"], db["PAD_DIMY"]}; + return RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + } + } catch (...)
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_COMMON_H_ +#endif diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cc index b4add2a3..6b6e7f9e 100644 --- a/src/routines/level1/xamax.cc +++ b/src/routines/level1/xamax.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level1/xamax.h" +#include "routines/level1/xamax.hpp" #include #include diff --git a/src/routines/level1/xamax.hpp b/src/routines/level1/xamax.hpp new file mode 100644 index 00000000..aa45a8e4 --- /dev/null +++ b/src/routines/level1/xamax.hpp @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xamax routine. The precision is implemented using a template argument.
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XAMAX_H_ +#define CLBLAST_ROUTINES_XAMAX_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xamax: public Routine { + public: + + // Constructor + Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX"); + + // Templated-precision implementation of the routine + StatusCode DoAmax(const size_t n, + const Buffer &imax_buffer, const size_t imax_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XAMAX_H_ +#endif diff --git a/src/routines/level1/xasum.cc b/src/routines/level1/xasum.cc index 80f04829..0c1ce903 100644 --- a/src/routines/level1/xasum.cc +++ b/src/routines/level1/xasum.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level1/xasum.h" +#include "routines/level1/xasum.hpp" #include #include diff --git a/src/routines/level1/xasum.hpp b/src/routines/level1/xasum.hpp new file mode 100644 index 00000000..5a253f4d --- /dev/null +++ b/src/routines/level1/xasum.hpp @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xasum routine. 
The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XASUM_H_ +#define CLBLAST_ROUTINES_XASUM_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xasum: public Routine { + public: + + // Constructor + Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM"); + + // Templated-precision implementation of the routine + StatusCode DoAsum(const size_t n, + const Buffer &asum_buffer, const size_t asum_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XASUM_H_ +#endif diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc index 4a548757..5b6c9e77 100644 --- a/src/routines/level1/xaxpy.cc +++ b/src/routines/level1/xaxpy.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level1/xaxpy.h" +#include "routines/level1/xaxpy.hpp" #include #include diff --git a/src/routines/level1/xaxpy.hpp b/src/routines/level1/xaxpy.hpp new file mode 100644 index 00000000..caac871e --- /dev/null +++ b/src/routines/level1/xaxpy.hpp @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xaxpy routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XAXPY_H_ +#define CLBLAST_ROUTINES_XAXPY_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xaxpy: public Routine { + public: + + // Constructor + Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY"); + + // Templated-precision implementation of the routine + StatusCode DoAxpy(const size_t n, const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XAXPY_H_ +#endif diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc index 92d31786..673ef349 100644 --- a/src/routines/level1/xcopy.cc +++ b/src/routines/level1/xcopy.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level1/xcopy.h" +#include "routines/level1/xcopy.hpp" #include #include diff --git a/src/routines/level1/xcopy.hpp b/src/routines/level1/xcopy.hpp new file mode 100644 index 00000000..0c424ba3 --- /dev/null +++ b/src/routines/level1/xcopy.hpp @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xcopy routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XCOPY_H_ +#define CLBLAST_ROUTINES_XCOPY_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xcopy: public Routine { + public: + + // Constructor + Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY"); + + // Templated-precision implementation of the routine + StatusCode DoCopy(const size_t n, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XCOPY_H_ +#endif diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc index 8709c541..bafea157 100644 --- a/src/routines/level1/xdot.cc +++ b/src/routines/level1/xdot.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level1/xdot.h" +#include "routines/level1/xdot.hpp" #include #include diff --git a/src/routines/level1/xdot.hpp b/src/routines/level1/xdot.hpp new file mode 100644 index 00000000..02c1efaa --- /dev/null +++ b/src/routines/level1/xdot.hpp @@ -0,0 +1,42 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xdot routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XDOT_H_ +#define CLBLAST_ROUTINES_XDOT_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xdot: public Routine { + public: + + // Constructor + Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT"); + + // Templated-precision implementation of the routine + StatusCode DoDot(const size_t n, + const Buffer &dot_buffer, const size_t dot_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const bool do_conjugate = false); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XDOT_H_ +#endif diff --git a/src/routines/level1/xdotc.cc b/src/routines/level1/xdotc.cc index b3a01079..27cf2bab 100644 --- a/src/routines/level1/xdotc.cc +++ b/src/routines/level1/xdotc.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level1/xdotc.h" +#include "routines/level1/xdotc.hpp" #include #include diff --git a/src/routines/level1/xdotc.hpp b/src/routines/level1/xdotc.hpp new file mode 100644 index 00000000..b8cbdaf5 --- /dev/null +++ b/src/routines/level1/xdotc.hpp @@ -0,0 +1,44 @@ + +// 
================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xdotc routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XDOTC_H_ +#define CLBLAST_ROUTINES_XDOTC_H_ + +#include "routines/level1/xdot.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xdotc: public Xdot { + public: + + // Uses the regular Xdot routine + using Xdot::DoDot; + + // Constructor + Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC"); + + // Templated-precision implementation of the routine + StatusCode DoDotc(const size_t n, + const Buffer &dot_buffer, const size_t dot_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XDOTC_H_ +#endif diff --git a/src/routines/level1/xdotu.cc b/src/routines/level1/xdotu.cc index 8dded6e0..0bce70b7 100644 --- a/src/routines/level1/xdotu.cc +++ b/src/routines/level1/xdotu.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level1/xdotu.h" +#include "routines/level1/xdotu.hpp" #include diff --git a/src/routines/level1/xdotu.hpp 
b/src/routines/level1/xdotu.hpp new file mode 100644 index 00000000..b3f73086 --- /dev/null +++ b/src/routines/level1/xdotu.hpp @@ -0,0 +1,44 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xdotu routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XDOTU_H_ +#define CLBLAST_ROUTINES_XDOTU_H_ + +#include "routines/level1/xdot.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xdotu: public Xdot { + public: + + // Uses the regular Xdot routine + using Xdot::DoDot; + + // Constructor + Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU"); + + // Templated-precision implementation of the routine + StatusCode DoDotu(const size_t n, + const Buffer &dot_buffer, const size_t dot_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XDOTU_H_ +#endif diff --git a/src/routines/level1/xmax.hpp b/src/routines/level1/xmax.hpp new file mode 100644 index 00000000..5a0236f2 --- /dev/null +++ b/src/routines/level1/xmax.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file 
is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xmax routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XMAX_H_ +#define CLBLAST_ROUTINES_XMAX_H_ + +#include "routine.hpp" +#include "routines/level1/xamax.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xmax: public Xamax { + public: + + // Members and methods from the base class + using Xamax::DoAmax; + + // Constructor + Xmax(Queue &queue, EventPointer event, const std::string &name = "MAX"): + Xamax(queue, event, name) { + } + + // Forwards to the regular absolute version. The implementation difference is realised in the + // kernel through a pre-processor macro based on the name of the routine. + StatusCode DoMax(const size_t n, + const Buffer &imax_buffer, const size_t imax_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XMAX_H_ +#endif diff --git a/src/routines/level1/xmin.hpp b/src/routines/level1/xmin.hpp new file mode 100644 index 00000000..6befec64 --- /dev/null +++ b/src/routines/level1/xmin.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xmin routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XMIN_H_ +#define CLBLAST_ROUTINES_XMIN_H_ + +#include "routine.hpp" +#include "routines/level1/xamax.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xmin: public Xamax { + public: + + // Members and methods from the base class + using Xamax::DoAmax; + + // Constructor + Xmin(Queue &queue, EventPointer event, const std::string &name = "MIN"): + Xamax(queue, event, name) { + } + + // Forwards to the regular max-absolute version. The implementation difference is realised in the + // kernel through a pre-processor macro based on the name of the routine. 
+ StatusCode DoMin(const size_t n, + const Buffer &imin_buffer, const size_t imin_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XMIN_H_ +#endif diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc index 105f991c..97615d8b 100644 --- a/src/routines/level1/xnrm2.cc +++ b/src/routines/level1/xnrm2.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level1/xnrm2.h" +#include "routines/level1/xnrm2.hpp" #include #include diff --git a/src/routines/level1/xnrm2.hpp b/src/routines/level1/xnrm2.hpp new file mode 100644 index 00000000..7baf07f5 --- /dev/null +++ b/src/routines/level1/xnrm2.hpp @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xnrm2 routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XNRM2_H_ +#define CLBLAST_ROUTINES_XNRM2_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xnrm2: public Routine { + public: + + // Constructor + Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2"); + + // Templated-precision implementation of the routine + StatusCode DoNrm2(const size_t n, + const Buffer &nrm2_buffer, const size_t nrm2_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XNRM2_H_ +#endif diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc index 3c1b5257..bcc43c3b 100644 --- a/src/routines/level1/xscal.cc +++ b/src/routines/level1/xscal.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level1/xscal.h" +#include "routines/level1/xscal.hpp" #include #include diff --git a/src/routines/level1/xscal.hpp b/src/routines/level1/xscal.hpp new file mode 100644 index 00000000..6c585cb2 --- /dev/null +++ b/src/routines/level1/xscal.hpp @@ -0,0 +1,39 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xscal routine. 
The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSCAL_H_ +#define CLBLAST_ROUTINES_XSCAL_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xscal: public Routine { + public: + + // Constructor + Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL"); + + // Templated-precision implementation of the routine + StatusCode DoScal(const size_t n, const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSCAL_H_ +#endif diff --git a/src/routines/level1/xsum.hpp b/src/routines/level1/xsum.hpp new file mode 100644 index 00000000..84e20bea --- /dev/null +++ b/src/routines/level1/xsum.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsum routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSUM_H_ +#define CLBLAST_ROUTINES_XSUM_H_ + +#include "routine.hpp" +#include "routines/level1/xasum.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsum: public Xasum { + public: + + // Members and methods from the base class + using Xasum::DoAsum; + + // Constructor + Xsum(Queue &queue, EventPointer event, const std::string &name = "SUM"): + Xasum(queue, event, name) { + } + + // Forwards to the regular absolute version. The implementation difference is realised in the + // kernel through a pre-processor macro based on the name of the routine. + StatusCode DoSum(const size_t n, + const Buffer &sum_buffer, const size_t sum_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSUM_H_ +#endif diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc index 27eb9b13..03907cbd 100644 --- a/src/routines/level1/xswap.cc +++ b/src/routines/level1/xswap.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level1/xswap.h" +#include "routines/level1/xswap.hpp" #include #include diff --git a/src/routines/level1/xswap.hpp b/src/routines/level1/xswap.hpp new file mode 100644 index 00000000..4f9ea36d --- /dev/null +++ b/src/routines/level1/xswap.hpp @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xswap routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSWAP_H_ +#define CLBLAST_ROUTINES_XSWAP_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xswap: public Routine { + public: + + // Constructor + Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP"); + + // Templated-precision implementation of the routine + StatusCode DoSwap(const size_t n, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSWAP_H_ +#endif diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc index 7a30c34a..ea4f001c 100644 --- a/src/routines/level2/xgbmv.cc +++ b/src/routines/level2/xgbmv.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xgbmv.h" +#include "routines/level2/xgbmv.hpp" #include #include diff --git a/src/routines/level2/xgbmv.hpp b/src/routines/level2/xgbmv.hpp new file mode 100644 index 00000000..686ab642 --- /dev/null +++ b/src/routines/level2/xgbmv.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the 
CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgbmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xgbmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XGBMV_H_ +#define CLBLAST_ROUTINES_XGBMV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xgbmv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV"); + + // Templated-precision implementation of the routine + StatusCode DoGbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XGBMV_H_ +#endif diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index ccadd131..21fb397c 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -11,7 +11,7 @@ // // 
================================================================================================= -#include "internal/routines/level2/xgemv.h" +#include "routines/level2/xgemv.hpp" #include #include diff --git a/src/routines/level2/xgemv.hpp b/src/routines/level2/xgemv.hpp new file mode 100644 index 00000000..e9afec8d --- /dev/null +++ b/src/routines/level2/xgemv.hpp @@ -0,0 +1,56 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgemv routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XGEMV_H_ +#define CLBLAST_ROUTINES_XGEMV_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xgemv: public Routine { + public: + + // Constructor + Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV"); + + // Templated-precision implementation of the routine + StatusCode DoGemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); + + // Generic version used also for other matrix-vector multiplications + StatusCode MatVec(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T 
alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + bool fast_kernel, bool fast_kernel_rot, + const size_t parameter, const bool packed, + const size_t kl, const size_t ku); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XGEMV_H_ +#endif diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc index 6ceaa00e..353047d2 100644 --- a/src/routines/level2/xger.cc +++ b/src/routines/level2/xger.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xger.h" +#include "routines/level2/xger.hpp" #include #include diff --git a/src/routines/level2/xger.hpp b/src/routines/level2/xger.hpp new file mode 100644 index 00000000..3c6abe44 --- /dev/null +++ b/src/routines/level2/xger.hpp @@ -0,0 +1,43 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xger routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XGER_H_ +#define CLBLAST_ROUTINES_XGER_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xger: public Routine { + public: + + // Constructor + Xger(Queue &queue, EventPointer event, const std::string &name = "GER"); + + // Templated-precision implementation of the routine + StatusCode DoGer(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XGER_H_ +#endif diff --git a/src/routines/level2/xgerc.cc b/src/routines/level2/xgerc.cc index 73284b52..d9feda97 100644 --- a/src/routines/level2/xgerc.cc +++ b/src/routines/level2/xgerc.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xgerc.h" +#include "routines/level2/xgerc.hpp" #include diff --git a/src/routines/level2/xgerc.hpp b/src/routines/level2/xgerc.hpp new file mode 100644 index 00000000..f1d04dfd --- /dev/null +++ b/src/routines/level2/xgerc.hpp @@ -0,0 +1,46 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgerc routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XGERC_H_ +#define CLBLAST_ROUTINES_XGERC_H_ + +#include "routines/level2/xger.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xgerc: public Xger { + public: + + // Uses the regular Xger routine + using Xger::DoGer; + + // Constructor + Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC"); + + // Templated-precision implementation of the routine + StatusCode DoGerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XGERC_H_ +#endif diff --git a/src/routines/level2/xgeru.cc b/src/routines/level2/xgeru.cc index 7730d6a5..da9e91c2 100644 --- a/src/routines/level2/xgeru.cc +++ b/src/routines/level2/xgeru.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xgeru.h" +#include "routines/level2/xgeru.hpp" #include diff --git a/src/routines/level2/xgeru.hpp b/src/routines/level2/xgeru.hpp new file mode 100644 index 00000000..fb50e917 --- /dev/null +++ b/src/routines/level2/xgeru.hpp @@ -0,0 +1,46 @@ + +// ================================================================================================= +// This file is part of the CLBlast 
project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgeru routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XGERU_H_ +#define CLBLAST_ROUTINES_XGERU_H_ + +#include "routines/level2/xger.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xgeru: public Xger { + public: + + // Uses the regular Xger routine + using Xger::DoGer; + + // Constructor + Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU"); + + // Templated-precision implementation of the routine + StatusCode DoGeru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XGERU_H_ +#endif diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc index 58591b50..f6c0e3c4 100644 --- a/src/routines/level2/xhbmv.cc +++ b/src/routines/level2/xhbmv.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xhbmv.h" +#include "routines/level2/xhbmv.hpp" #include #include diff --git a/src/routines/level2/xhbmv.hpp b/src/routines/level2/xhbmv.hpp new file mode 100644 index 00000000..d668eb88 
--- /dev/null +++ b/src/routines/level2/xhbmv.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhbmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xhbmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHBMV_H_ +#define CLBLAST_ROUTINES_XHBMV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xhbmv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV"); + + // Templated-precision implementation of the routine + StatusCode DoHbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHBMV_H_ +#endif diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc index b4ef0fa4..2cbcf7b4 100644 --- 
a/src/routines/level2/xhemv.cc +++ b/src/routines/level2/xhemv.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xhemv.h" +#include "routines/level2/xhemv.hpp" #include #include diff --git a/src/routines/level2/xhemv.hpp b/src/routines/level2/xhemv.hpp new file mode 100644 index 00000000..8e062fd3 --- /dev/null +++ b/src/routines/level2/xhemv.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhemv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHEMV_H_ +#define CLBLAST_ROUTINES_XHEMV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xhemv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV"); + + // Templated-precision implementation of the routine + StatusCode DoHemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHEMV_H_ +#endif diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc index 939e17bb..ed8ba9e9 100644 --- a/src/routines/level2/xher.cc +++ b/src/routines/level2/xher.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xher.h" +#include "routines/level2/xher.hpp" #include diff --git a/src/routines/level2/xher.hpp b/src/routines/level2/xher.hpp new file mode 100644 index 00000000..9ff6bf3f --- /dev/null +++ b/src/routines/level2/xher.hpp @@ -0,0 +1,46 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xher routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHER_H_ +#define CLBLAST_ROUTINES_XHER_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xher: public Routine { + public: + + // Constructor + Xher(Queue &queue, EventPointer event, const std::string &name = "HER"); + + // Translates alpha of type 'U' into type 'T' + T GetAlpha(const U alpha); + + // Templated-precision implementation of the routine + StatusCode DoHer(const Layout layout, const Triangle triangle, + const size_t n, + const U alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const bool packed = false); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHER_H_ +#endif diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc index 95dbd87a..50572cea 100644 --- a/src/routines/level2/xher2.cc +++ b/src/routines/level2/xher2.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xher2.h" +#include "routines/level2/xher2.hpp" #include diff --git a/src/routines/level2/xher2.hpp b/src/routines/level2/xher2.hpp new file mode 100644 index 00000000..8c53c047 --- /dev/null +++ b/src/routines/level2/xher2.hpp @@ -0,0 +1,44 @@ + +// 
================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xher2 routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHER2_H_ +#define CLBLAST_ROUTINES_XHER2_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xher2: public Routine { + public: + + // Constructor + Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2"); + + // Templated-precision implementation of the routine + StatusCode DoHer2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const bool packed = false); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHER2_H_ +#endif diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc index 92686dbe..e6f82b34 100644 --- a/src/routines/level2/xhpmv.cc +++ b/src/routines/level2/xhpmv.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xhpmv.h" +#include "routines/level2/xhpmv.hpp" #include #include diff --git 
a/src/routines/level2/xhpmv.hpp b/src/routines/level2/xhpmv.hpp new file mode 100644 index 00000000..b11192f9 --- /dev/null +++ b/src/routines/level2/xhpmv.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhpmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xhpmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHPMV_H_ +#define CLBLAST_ROUTINES_XHPMV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xhpmv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV"); + + // Templated-precision implementation of the routine + StatusCode DoHpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &ap_buffer, const size_t ap_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHPMV_H_ +#endif diff --git a/src/routines/level2/xhpr.cc 
b/src/routines/level2/xhpr.cc index 4b31ad09..225ebfe5 100644 --- a/src/routines/level2/xhpr.cc +++ b/src/routines/level2/xhpr.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xhpr.h" +#include "routines/level2/xhpr.hpp" #include diff --git a/src/routines/level2/xhpr.hpp b/src/routines/level2/xhpr.hpp new file mode 100644 index 00000000..37801c68 --- /dev/null +++ b/src/routines/level2/xhpr.hpp @@ -0,0 +1,45 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhpr routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHPR_H_ +#define CLBLAST_ROUTINES_XHPR_H_ + +#include "routines/level2/xher.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xhpr: public Xher { + public: + + // Uses the regular Xher routine + using Xher::DoHer; + + // Constructor + Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR"); + + // Templated-precision implementation of the routine + StatusCode DoHpr(const Layout layout, const Triangle triangle, + const size_t n, + const U alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &ap_buffer, const size_t ap_offset); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHPR_H_ +#endif diff --git a/src/routines/level2/xhpr2.cc b/src/routines/level2/xhpr2.cc index 9be24f43..85f9d3f9 100644 --- a/src/routines/level2/xhpr2.cc +++ b/src/routines/level2/xhpr2.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xhpr2.h" +#include "routines/level2/xhpr2.hpp" #include diff --git a/src/routines/level2/xhpr2.hpp b/src/routines/level2/xhpr2.hpp new file mode 100644 index 00000000..d66dce55 --- /dev/null +++ b/src/routines/level2/xhpr2.hpp @@ -0,0 +1,46 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhpr2 routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHPR2_H_ +#define CLBLAST_ROUTINES_XHPR2_H_ + +#include "routines/level2/xher2.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xhpr2: public Xher2 { + public: + + // Uses the regular Xher2 routine + using Xher2::DoHer2; + + // Constructor + Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2"); + + // Templated-precision implementation of the routine + StatusCode DoHpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &ap_buffer, const size_t ap_offset); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHPR2_H_ +#endif diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc index 66ba74e8..28730899 100644 --- a/src/routines/level2/xsbmv.cc +++ b/src/routines/level2/xsbmv.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xsbmv.h" +#include "routines/level2/xsbmv.hpp" #include #include diff --git a/src/routines/level2/xsbmv.hpp b/src/routines/level2/xsbmv.hpp new file mode 100644 index 00000000..16c5e9a8 --- /dev/null +++ b/src/routines/level2/xsbmv.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the 
CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsbmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xsbmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSBMV_H_ +#define CLBLAST_ROUTINES_XSBMV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsbmv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV"); + + // Templated-precision implementation of the routine + StatusCode DoSbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSBMV_H_ +#endif diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc index 589a97d4..f6651012 100644 --- a/src/routines/level2/xspmv.cc +++ b/src/routines/level2/xspmv.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include 
"internal/routines/level2/xspmv.h" +#include "routines/level2/xspmv.hpp" #include #include diff --git a/src/routines/level2/xspmv.hpp b/src/routines/level2/xspmv.hpp new file mode 100644 index 00000000..a0c69b85 --- /dev/null +++ b/src/routines/level2/xspmv.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xspmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xspmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSPMV_H_ +#define CLBLAST_ROUTINES_XSPMV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xspmv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV"); + + // Templated-precision implementation of the routine + StatusCode DoSpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &ap_buffer, const size_t ap_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // 
namespace clblast + +// CLBLAST_ROUTINES_XSPMV_H_ +#endif diff --git a/src/routines/level2/xspr.cc b/src/routines/level2/xspr.cc index c556b920..a75fe9c3 100644 --- a/src/routines/level2/xspr.cc +++ b/src/routines/level2/xspr.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xspr.h" +#include "routines/level2/xspr.hpp" #include diff --git a/src/routines/level2/xspr.hpp b/src/routines/level2/xspr.hpp new file mode 100644 index 00000000..6468c736 --- /dev/null +++ b/src/routines/level2/xspr.hpp @@ -0,0 +1,45 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xspr routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSPR_H_ +#define CLBLAST_ROUTINES_XSPR_H_ + +#include "routines/level2/xher.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xspr: public Xher { + public: + + // Uses the regular Xher routine + using Xher::DoHer; + + // Constructor + Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR"); + + // Templated-precision implementation of the routine + StatusCode DoSpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &ap_buffer, const size_t ap_offset); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSPR_H_ +#endif diff --git a/src/routines/level2/xspr2.cc b/src/routines/level2/xspr2.cc index c4ad5dc4..c39a2eb4 100644 --- a/src/routines/level2/xspr2.cc +++ b/src/routines/level2/xspr2.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xspr2.h" +#include "routines/level2/xspr2.hpp" #include diff --git a/src/routines/level2/xspr2.hpp b/src/routines/level2/xspr2.hpp new file mode 100644 index 00000000..693c56a1 --- /dev/null +++ b/src/routines/level2/xspr2.hpp @@ -0,0 +1,46 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xspr2 routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSPR2_H_ +#define CLBLAST_ROUTINES_XSPR2_H_ + +#include "routines/level2/xher2.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xspr2: public Xher2 { + public: + + // Uses the regular Xher2 routine + using Xher2::DoHer2; + + // Constructor + Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2"); + + // Templated-precision implementation of the routine + StatusCode DoSpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &ap_buffer, const size_t ap_offset); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSPR2_H_ +#endif diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc index 2a404a8a..648d2a3e 100644 --- a/src/routines/level2/xsymv.cc +++ b/src/routines/level2/xsymv.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xsymv.h" +#include "routines/level2/xsymv.hpp" #include #include diff --git a/src/routines/level2/xsymv.hpp b/src/routines/level2/xsymv.hpp new file mode 100644 index 00000000..67815f2f --- /dev/null +++ b/src/routines/level2/xsymv.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the 
CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsymv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xsymv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYMV_H_ +#define CLBLAST_ROUTINES_XSYMV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsymv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::MatVec; + + // Constructor + Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV"); + + // Templated-precision implementation of the routine + StatusCode DoSymv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSYMV_H_ +#endif diff --git a/src/routines/level2/xsyr.cc b/src/routines/level2/xsyr.cc index 892517d7..758d8f8f 100644 --- a/src/routines/level2/xsyr.cc +++ b/src/routines/level2/xsyr.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xsyr.h" 
+#include "routines/level2/xsyr.hpp" #include diff --git a/src/routines/level2/xsyr.hpp b/src/routines/level2/xsyr.hpp new file mode 100644 index 00000000..20393454 --- /dev/null +++ b/src/routines/level2/xsyr.hpp @@ -0,0 +1,45 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYR_H_ +#define CLBLAST_ROUTINES_XSYR_H_ + +#include "routines/level2/xher.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsyr: public Xher { + public: + + // Uses the regular Xher routine + using Xher::DoHer; + + // Constructor + Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR"); + + // Templated-precision implementation of the routine + StatusCode DoSyr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSYR_H_ +#endif diff --git a/src/routines/level2/xsyr2.cc b/src/routines/level2/xsyr2.cc index e6dfd158..6f43b219 100644 --- a/src/routines/level2/xsyr2.cc +++ b/src/routines/level2/xsyr2.cc @@ -11,7 +11,7 @@ // // 
================================================================================================= -#include "internal/routines/level2/xsyr2.h" +#include "routines/level2/xsyr2.hpp" #include diff --git a/src/routines/level2/xsyr2.hpp b/src/routines/level2/xsyr2.hpp new file mode 100644 index 00000000..1a8dcbe8 --- /dev/null +++ b/src/routines/level2/xsyr2.hpp @@ -0,0 +1,46 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr2 routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYR2_H_ +#define CLBLAST_ROUTINES_XSYR2_H_ + +#include "routines/level2/xher2.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsyr2: public Xher2 { + public: + + // Uses the regular Xher2 routine + using Xher2::DoHer2; + + // Constructor + Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2"); + + // Templated-precision implementation of the routine + StatusCode DoSyr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// 
CLBLAST_ROUTINES_XSYR2_H_ +#endif diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc index 86e28dfb..e315c544 100644 --- a/src/routines/level2/xtbmv.cc +++ b/src/routines/level2/xtbmv.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xtbmv.h" +#include "routines/level2/xtbmv.hpp" #include #include diff --git a/src/routines/level2/xtbmv.hpp b/src/routines/level2/xtbmv.hpp new file mode 100644 index 00000000..389e9705 --- /dev/null +++ b/src/routines/level2/xtbmv.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtbmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xtbmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTBMV_H_ +#define CLBLAST_ROUTINES_XTBMV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xtbmv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::queue_; + using Xgemv::context_; + using Xgemv::MatVec; + + // Constructor + Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV"); + + // Templated-precision implementation of the routine + StatusCode DoTbmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTBMV_H_ +#endif diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc index 72445547..46811089 100644 --- a/src/routines/level2/xtpmv.cc +++ b/src/routines/level2/xtpmv.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xtpmv.h" +#include "routines/level2/xtpmv.hpp" #include #include diff --git a/src/routines/level2/xtpmv.hpp b/src/routines/level2/xtpmv.hpp new file mode 100644 index 00000000..0e8cf1d2 --- /dev/null +++ b/src/routines/level2/xtpmv.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtpmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xtpmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTPMV_H_ +#define CLBLAST_ROUTINES_XTPMV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xtpmv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::queue_; + using Xgemv::context_; + using Xgemv::MatVec; + + // Constructor + Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV"); + + // Templated-precision implementation of the routine + StatusCode DoTpmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &ap_buffer, const size_t ap_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTPMV_H_ +#endif diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc index df6f85a3..d2f24252 100644 --- a/src/routines/level2/xtrmv.cc +++ b/src/routines/level2/xtrmv.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level2/xtrmv.h" +#include "routines/level2/xtrmv.hpp" #include #include diff --git 
a/src/routines/level2/xtrmv.hpp b/src/routines/level2/xtrmv.hpp new file mode 100644 index 00000000..07dd7841 --- /dev/null +++ b/src/routines/level2/xtrmv.hpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xtrmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTRMV_H_ +#define CLBLAST_ROUTINES_XTRMV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xtrmv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::queue_; + using Xgemv::context_; + using Xgemv::MatVec; + + // Constructor + Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV"); + + // Templated-precision implementation of the routine + StatusCode DoTrmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTRMV_H_ +#endif diff --git 
a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 8386ad09..9ea5559c 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level3/xgemm.h" +#include "routines/level3/xgemm.hpp" #include #include diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp new file mode 100644 index 00000000..71723d78 --- /dev/null +++ b/src/routines/level3/xgemm.hpp @@ -0,0 +1,48 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgemm routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XGEMM_H_ +#define CLBLAST_ROUTINES_XGEMM_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xgemm: public Routine { + public: + + // Constructor + Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM"); + + // Templated-precision implementation of the routine + StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + protected: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XGEMM_H_ +#endif diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc index 8120c09c..9813503e 100644 --- a/src/routines/level3/xhemm.cc +++ b/src/routines/level3/xhemm.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level3/xhemm.h" +#include "routines/level3/xhemm.hpp" #include #include diff --git a/src/routines/level3/xhemm.hpp b/src/routines/level3/xhemm.hpp new file mode 100644 index 00000000..d79b42a1 --- /dev/null +++ b/src/routines/level3/xhemm.hpp @@ -0,0 +1,54 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhemm routine. It is based on the generalized matrix multiplication +// routine (Xgemm). The implementation is very similar to the Xsymm routine. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHEMM_H_ +#define CLBLAST_ROUTINES_XHEMM_H_ + +#include "routines/level3/xgemm.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xhemm: public Xgemm { + public: + + // Uses methods and variables the regular Xgemm routine + using Xgemm::precision_; + using Xgemm::routine_name_; + using Xgemm::queue_; + using Xgemm::context_; + using Xgemm::device_; + using Xgemm::db_; + using Xgemm::DoGemm; + + // Constructor + Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM"); + + // Templated-precision implementation of the routine + StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHEMM_H_ +#endif diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index bd0f83dd..bd7a053e 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -11,7 +11,7 @@ // // 
================================================================================================= -#include "internal/routines/level3/xher2k.h" +#include "routines/level3/xher2k.hpp" #include #include diff --git a/src/routines/level3/xher2k.hpp b/src/routines/level3/xher2k.hpp new file mode 100644 index 00000000..23996219 --- /dev/null +++ b/src/routines/level3/xher2k.hpp @@ -0,0 +1,46 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xher2k routine. The precision is implemented using the template argument +// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the +// Xsyr2k routine. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHER2K_H_ +#define CLBLAST_ROUTINES_XHER2K_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xher2k: public Routine { + public: + + // Constructor + Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K"); + + // Templated-precision implementation of the routine + StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); +}; + +// 
================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHER2K_H_ +#endif diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index 6155734a..6ef7f21f 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level3/xherk.h" +#include "routines/level3/xherk.hpp" #include #include diff --git a/src/routines/level3/xherk.hpp b/src/routines/level3/xherk.hpp new file mode 100644 index 00000000..3f156a1b --- /dev/null +++ b/src/routines/level3/xherk.hpp @@ -0,0 +1,45 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xherk routine. The precision is implemented using the template argument +// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the +// Xsyrk routine. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHERK_H_ +#define CLBLAST_ROUTINES_XHERK_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xherk: public Routine { + public: + + // Constructor + Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK"); + + // Templated-precision implementation of the routine + StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const U alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const U beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHERK_H_ +#endif diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc index c5e56617..04e4b718 100644 --- a/src/routines/level3/xsymm.cc +++ b/src/routines/level3/xsymm.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level3/xsymm.h" +#include "routines/level3/xsymm.hpp" #include #include diff --git a/src/routines/level3/xsymm.hpp b/src/routines/level3/xsymm.hpp new file mode 100644 index 00000000..754dd7a0 --- /dev/null +++ b/src/routines/level3/xsymm.hpp @@ -0,0 +1,56 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsymm routine. It is based on the generalized matrix multiplication +// routine (Xgemm). The Xsymm class inherits from the templated class Xgemm, allowing it to call the +// "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by +// transforming it into a general matrix, and then calls the regular GEMM code. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYMM_H_ +#define CLBLAST_ROUTINES_XSYMM_H_ + +#include "routines/level3/xgemm.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsymm: public Xgemm { + public: + + // Uses methods and variables the regular Xgemm routine + using Xgemm::precision_; + using Xgemm::routine_name_; + using Xgemm::queue_; + using Xgemm::context_; + using Xgemm::device_; + using Xgemm::db_; + using Xgemm::DoGemm; + + // Constructor + Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM"); + + // Templated-precision implementation of the routine + StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSYMM_H_ +#endif diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index f9655889..424d4d2d 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -11,7 +11,7 @@ // // 
================================================================================================= -#include "internal/routines/level3/xsyr2k.h" +#include "routines/level3/xsyr2k.hpp" #include #include diff --git a/src/routines/level3/xsyr2k.hpp b/src/routines/level3/xsyr2k.hpp new file mode 100644 index 00000000..56185653 --- /dev/null +++ b/src/routines/level3/xsyr2k.hpp @@ -0,0 +1,46 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr2k routine. The precision is implemented using a template argument. +// The implementation is very similar to Xsyrk (see header for details), except for the fact that +// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYR2K_H_ +#define CLBLAST_ROUTINES_XSYR2K_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsyr2k: public Routine { + public: + + // Constructor + Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K"); + + // Templated-precision implementation of the routine + StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSYR2K_H_ +#endif diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index bceb6afd..f56c232b 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/level3/xsyrk.h" +#include "routines/level3/xsyrk.hpp" #include #include diff --git a/src/routines/level3/xsyrk.hpp b/src/routines/level3/xsyrk.hpp new file mode 100644 index 00000000..7c075c26 --- /dev/null +++ b/src/routines/level3/xsyrk.hpp @@ -0,0 +1,47 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyrk routine. The precision is implemented using a template argument. +// The implementation is based on the regular Xgemm routine and kernel, but with two main changes: +// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part. +// 2) The main Xgemm kernel masks workgroups not contributing to usefull data. This is only for +// performance reasons, as the actual masking is done later (see the first point). +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYRK_H_ +#define CLBLAST_ROUTINES_XSYRK_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsyrk: public Routine { + public: + + // Constructor + Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK"); + + // Templated-precision implementation of the routine + StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSYRK_H_ +#endif diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc index 92dda9fb..74a82822 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -11,7 +11,7 @@ // // 
================================================================================================= -#include "internal/routines/level3/xtrmm.h" +#include "routines/level3/xtrmm.hpp" #include #include diff --git a/src/routines/level3/xtrmm.hpp b/src/routines/level3/xtrmm.hpp new file mode 100644 index 00000000..bb435592 --- /dev/null +++ b/src/routines/level3/xtrmm.hpp @@ -0,0 +1,54 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrmm routine. The implementation is based on first transforming the +// upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM +// routine. Therefore, this class inherits from the Xgemm class. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTRMM_H_ +#define CLBLAST_ROUTINES_XTRMM_H_ + +#include "routines/level3/xgemm.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xtrmm: public Xgemm { + public: + + // Uses methods and variables the regular Xgemm routine + using Xgemm::precision_; + using Xgemm::routine_name_; + using Xgemm::queue_; + using Xgemm::context_; + using Xgemm::device_; + using Xgemm::db_; + using Xgemm::DoGemm; + + // Constructor + Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM"); + + // Templated-precision implementation of the routine + StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTRMM_H_ +#endif diff --git a/src/routines/levelx/xomatcopy.cc b/src/routines/levelx/xomatcopy.cc index 6e4bddb2..e8593301 100644 --- a/src/routines/levelx/xomatcopy.cc +++ b/src/routines/levelx/xomatcopy.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/routines/levelx/xomatcopy.h" +#include "routines/levelx/xomatcopy.hpp" #include #include diff --git a/src/routines/levelx/xomatcopy.hpp b/src/routines/levelx/xomatcopy.hpp new file mode 100644 index 00000000..0e580230 --- /dev/null +++ b/src/routines/levelx/xomatcopy.hpp @@ -0,0 +1,41 @@ + +// 
================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xomatcopy routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XOMATCOPY_H_ +#define CLBLAST_ROUTINES_XOMATCOPY_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xomatcopy: public Routine { + public: + + // Constructor + Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY"); + + // Templated-precision implementation of the routine + StatusCode DoOmatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XOMATCOPY_H_ +#endif diff --git a/src/tuning/copy_fast.cc b/src/tuning/copy_fast.cc deleted file mode 100644 index 09fdbaba..00000000 --- a/src/tuning/copy_fast.cc +++ /dev/null @@ -1,122 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels. -// -// ================================================================================================= - -#include -#include - -#include "internal/utilities.h" -#include "internal/tuning.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneCopy { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "copy"; } - static std::string KernelName() { return "CopyMatrixFast"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/copy_fast.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static 
size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32}); - tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32}); - tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8}); - tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"COPY_DIMX", "COPY_DIMY"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"COPY_VW", "COPY_WPT"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// 
================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/copy_pad.cc b/src/tuning/copy_pad.cc deleted file mode 100644 index 7088b3bf..00000000 --- a/src/tuning/copy_pad.cc +++ /dev/null @@ -1,130 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels. 
-// -// ================================================================================================= - -#include -#include - -#include "internal/utilities.h" -#include "internal/tuning.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TunePad { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "pad"; } - static std::string KernelName() { return "CopyPadMatrix"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/copy_pad.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "PAD_DIMX", {8, 16, 32}); - 
tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32}); - tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4}); - tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"PAD_DIMX", "PAD_DIMY"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"PAD_WPTX", "PAD_WPTY"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentScalar(0); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return 
"GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/kernels/copy_fast.cc b/src/tuning/kernels/copy_fast.cc new file mode 100644 index 00000000..34269bc7 --- /dev/null +++ b/src/tuning/kernels/copy_fast.cc @@ -0,0 +1,122 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels. 
+// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneCopy { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "copy"; } + static std::string KernelName() { return "CopyMatrixFast"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/copy_fast.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32}); + 
tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32}); + tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8}); + tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"COPY_DIMX", "COPY_DIMY"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"COPY_VW", "COPY_WPT"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentInput(alpha_buffer); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * args.m * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int 
argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/copy_pad.cc b/src/tuning/kernels/copy_pad.cc new file mode 100644 index 00000000..1e0dccd3 --- /dev/null +++ b/src/tuning/kernels/copy_pad.cc @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels. 
+// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TunePad { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "pad"; } + static std::string KernelName() { return "CopyPadMatrix"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/copy_pad.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "PAD_DIMX", {8, 16, 32}); + 
tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32}); + tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4}); + tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"PAD_DIMX", "PAD_DIMY"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"PAD_WPTX", "PAD_WPTY"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(0); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * args.m * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return 
"GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/transpose_fast.cc b/src/tuning/kernels/transpose_fast.cc new file mode 100644 index 00000000..7ac19cb6 --- /dev/null +++ b/src/tuning/kernels/transpose_fast.cc @@ -0,0 +1,127 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels. 
+// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneTranspose { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "transpose"; } + static std::string KernelName() { return "TransposeMatrixFast"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/transpose_fast.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "TRA_DIM", {4, 8, 
16, 32, 64}); + tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16}); + tuner.AddParameter(id, "TRA_PAD", {0, 1}); + tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { + auto LocalMemorySize = [args] (std::vector v) { + return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); + }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"}); + } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"TRA_DIM", "TRA_DIM"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"TRA_WPT", "TRA_WPT"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentInput(alpha_buffer); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * args.m * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// 
================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/transpose_pad.cc b/src/tuning/kernels/transpose_pad.cc new file mode 100644 index 00000000..63274415 --- /dev/null +++ b/src/tuning/kernels/transpose_pad.cc @@ -0,0 +1,134 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels. 
+// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TunePadTranspose { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "padtranspose"; } + static std::string KernelName() { return "TransposePadMatrix"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/transpose_pad.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "PADTRA_TILE", 
{8, 16, 32, 64}); + tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16}); + tuner.AddParameter(id, "PADTRA_PAD", {0, 1}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { + auto LocalMemorySize = [args] (std::vector v) { + return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); + }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"}); + } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"PADTRA_TILE", "PADTRA_TILE"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"PADTRA_WPT", "PADTRA_WPT"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(0); + 
} + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * args.m * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/xaxpy.cc b/src/tuning/kernels/xaxpy.cc new file mode 100644 index 00000000..88d12c1f --- /dev/null +++ b/src/tuning/kernels/xaxpy.cc @@ -0,0 +1,125 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the xaxpy OpenCL kernels. 
+// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneXaxpy { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "xaxpy"; } + static std::string KernelName() { return "XaxpyFast"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level1/level1.opencl" + #include "../src/kernels/level1/xaxpy.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgN, kArgAlpha}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &args) { + if (!IsMultiple(args.n, 64)) { + throw std::runtime_error("'XaxpyFast' requires 'n' to be a multiple of WGS*WPT*VW"); + } + } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1; } // N/A for this kernel + static size_t DefaultN() { return 4096*1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { return args.n; } + static size_t GetSizeY(const Arguments &args) { return args.n; } + static size_t GetSizeA(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static 
void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 2048}); + tuner.AddParameter(id, "WPT", {1, 2, 4, 8}); + tuner.AddParameter(id, "VW", {1, 2, 4, 8}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1}; } + static std::vector LocalSizeRef() { return {64}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"WGS"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"WPT"},{"VW"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &x_vec, std::vector &y_vec, + std::vector &, std::vector &, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentInput(x_vec); + tuner.AddArgumentOutput(y_vec); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 3 * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) 
+int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/xdot.cc b/src/tuning/kernels/xdot.cc new file mode 100644 index 00000000..1581e13f --- /dev/null +++ b/src/tuning/kernels/xdot.cc @@ -0,0 +1,137 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the xdot OpenCL kernels. Note that the results are +// not verified, since the result is not final and depends on the WGS2 parameter. +// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneXdot { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "xdot_"+std::to_string(V); } + static std::string KernelName() { return (V==1) ? 
"Xdot" : "XdotEpilogue"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level1/xdot.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgN}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1; } // N/A for this kernel + static size_t DefaultN() { return 2*1024*1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { return args.n; } + static size_t GetSizeY(const Arguments &args) { return args.n; } + static size_t GetSizeA(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &args) { return args.n; } // Worst case + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &) { return (V==1) ? std::vector{2*64} : std::vector{1}; } + static std::vector GlobalSizeRef(const Arguments &) { return (V==1) ? 
std::vector{2*64*64} : std::vector{64}; } + static std::vector LocalSize() { return {1}; } + static std::vector LocalSizeRef() { return {64}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } + static TransformVector DivGlobal() { return {}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &x_vec, std::vector &y_vec, + std::vector &, std::vector &, std::vector &, + std::vector &temp) { + if (V == 1) { + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentInput(x_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentInput(y_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentInput(temp); // No output checking for the result - size varies + tuner.AddArgumentScalar(static_cast(false)); + } + else { + tuner.AddArgumentInput(temp); + tuner.AddArgumentInput(x_vec); // No output checking for the result - store somewhere + tuner.AddArgumentScalar(0); + } + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return (V==1) ? (2*args.n + 1) * GetBytes(args.precision) : 1 * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return (V==1) ? 
"GB/s" : "N/A"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Function to tune a specific variation V (not within the clblast namespace) +template +void StartVariation(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } +} + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/xgemm.cc b/src/tuning/kernels/xgemm.cc new file mode 100644 index 00000000..4b1efdef --- /dev/null +++ b/src/tuning/kernels/xgemm.cc @@ -0,0 +1,162 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. 
+// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneXgemm { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "xgemm"; } + static std::string KernelName() { return "Xgemm"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/xgemm_part1.opencl" + #include "../src/kernels/level3/xgemm_part2.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction}; + } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1024; } + static double DefaultFraction() { return 2048.0; } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.k; } + static size_t GetSizeB(const Arguments &args) { return args.n * args.k; } + static size_t GetSizeC(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); + 
tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); + tuner.AddParameter(id, "KWG", {16, 32}); + tuner.AddParameter(id, "MDIMC", {8, 16, 32}); + tuner.AddParameter(id, "NDIMC", {8, 16, 32}); + tuner.AddParameter(id, "MDIMA", {8, 16, 32}); + tuner.AddParameter(id, "NDIMB", {8, 16, 32}); + tuner.AddParameter(id, "KWI", {2, 8}); + tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); + tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); + tuner.AddParameter(id, "STRM", {0, 1}); + tuner.AddParameter(id, "STRN", {0, 1}); + tuner.AddParameter(id, "SA", {0, 1}); + tuner.AddParameter(id, "SB", {0, 1}); + } + + // Sets the constraints + static void SetConstraints(cltune::Tuner &tuner, const size_t id) { + auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; + auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; + auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; + // Requirement for unrolling the KWG loop + tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"}); + // Required for integer MWI and NWI + tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}); + tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}); + // Required for integer MWIA and NWIB + tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}); + tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}); + // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) 
+ tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}); + tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}); + } + + // Sets the local memory size + static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { + auto LocalMemorySize = [args] (std::vector v) { + return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision)); + }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM", + "SB", "KWG", "NWG", "VWN"}); + } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"MDIMC", "NDIMC"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {{"MDIMC", "NDIMC"}}; } + static TransformVector DivGlobal() { return {{"MWG", "NWG"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &c_mat, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + auto beta_buffer = std::vector{args.beta}; + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.k)); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentInput(beta_buffer); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentInput(b_mat); + tuner.AddArgumentOutput(c_mat); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * 
args.m * args.n * args.k; + } + static std::string PerformanceUnit() { return "GFLOPS"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/xgemv.cc b/src/tuning/kernels/xgemv.cc new file mode 100644 index 00000000..d42155ae --- /dev/null +++ b/src/tuning/kernels/xgemv.cc @@ -0,0 +1,156 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the xgemv OpenCL kernels. 
Three variants are tuned: +// 1: The full version of the kernel +// 2: The fast version for non-transposed matrices +// 3: The fast version for transposed matrices +// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneXgemv { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "xgemv_"+std::to_string(V); } + static std::string KernelName() { return (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot"); } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level2/xgemv.opencl" + #include "../src/kernels/level2/xgemv_fast.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha, kArgBeta}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 2048; } + static size_t DefaultN() { return 2048; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { return args.n; } + static size_t GetSizeY(const Arguments &args) { return args.m; } + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // 
N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256}); + tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); + if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); } + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &tuner, const size_t id) { + auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; + if (V==2 || V==3) { + tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}); + } + } + static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { + auto LocalMemorySize = [args] (std::vector v) { return v[0]*GetBytes(args.precision); }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)}); + } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1}; } + static std::vector LocalSizeRef() { return {64}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"WGS"+std::to_string(V)}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"WPT"+std::to_string(V)}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &x_vec, std::vector &y_vec, + std::vector &a_mat, std::vector &, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + auto beta_buffer = std::vector{args.beta}; + auto a_rotated = (V==3) ? 
1 : 0; + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentInput(beta_buffer); + tuner.AddArgumentScalar(static_cast(a_rotated)); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentInput(x_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentOutput(y_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentScalar(0); // Conjugate transpose + tuner.AddArgumentScalar(0); // Additional parameter + tuner.AddArgumentScalar(0); // Banded 'kl' + tuner.AddArgumentScalar(0); // Banded 'ku' + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return (args.m*args.n + 2*args.m + args.n) * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Function to tune a specific variation V (not within the clblast namespace) +template +void StartVariation(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } +} + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); + StartVariation<3>(argc, 
argv); + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/xger.cc b/src/tuning/kernels/xger.cc new file mode 100644 index 00000000..d2590c53 --- /dev/null +++ b/src/tuning/kernels/xger.cc @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the xger OpenCL kernels. +// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneXger { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "xger"; } + static std::string KernelName() { return "Xger"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level2/level2.opencl" + #include "../src/kernels/level2/xger.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgN, kArgM, kArgAlpha}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // 
N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { return args.m; } + static size_t GetSizeY(const Arguments &args) { return args.n; } + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "WGS1", {4, 8, 16, 32, 64, 128, 256, 512}); + tuner.AddParameter(id, "WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256}); + tuner.AddParameter(id, "WPT", {1, 2, 4}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"WGS1", "WGS2"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"WPT", "WPT"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &x_vec, std::vector &y_vec, + std::vector &a_mat, std::vector &, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + 
tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentInput(x_vec); + tuner.AddArgumentScalar(0); // x_offset + tuner.AddArgumentScalar(1); // x_increment + tuner.AddArgumentInput(y_vec); + tuner.AddArgumentScalar(0); // y_offset + tuner.AddArgumentScalar(1); // y_increment + tuner.AddArgumentOutput(a_mat); + tuner.AddArgumentScalar(0); // a_offset + tuner.AddArgumentScalar(static_cast(args.m)); // a_ld + tuner.AddArgumentScalar(0); // a_is_rowmajor + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/transpose_fast.cc b/src/tuning/transpose_fast.cc deleted file mode 100644 index 3b0bdeb5..00000000 --- a/src/tuning/transpose_fast.cc +++ /dev/null @@ -1,127 +0,0 @@ - -// 
================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels. -// -// ================================================================================================= - -#include -#include - -#include "internal/utilities.h" -#include "internal/tuning.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneTranspose { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "transpose"; } - static std::string KernelName() { return "TransposeMatrixFast"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/transpose_fast.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t 
GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64}); - tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16}); - tuner.AddParameter(id, "TRA_PAD", {0, 1}); - tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { - return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"}); - } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"TRA_DIM", "TRA_DIM"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"TRA_WPT", "TRA_WPT"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; 
- tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/transpose_pad.cc b/src/tuning/transpose_pad.cc deleted file mode 100644 index b9ab3ffa..00000000 --- a/src/tuning/transpose_pad.cc +++ /dev/null @@ -1,134 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels. 
-// -// ================================================================================================= - -#include -#include - -#include "internal/utilities.h" -#include "internal/tuning.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TunePadTranspose { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "padtranspose"; } - static std::string KernelName() { return "TransposePadMatrix"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/transpose_pad.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, 
"PADTRA_TILE", {8, 16, 32, 64}); - tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16}); - tuner.AddParameter(id, "PADTRA_PAD", {0, 1}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { - return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"}); - } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"PADTRA_TILE", "PADTRA_TILE"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"PADTRA_WPT", "PADTRA_WPT"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); - 
tuner.AddArgumentScalar(0); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp new file mode 100644 index 00000000..19df5f9a --- /dev/null +++ b/src/tuning/tuning.hpp @@ -0,0 +1,161 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the interface to the CLTune auto-tuner. This is only used for the optional +// and stand-alone tuner binaries and not part of the core of CLBlast. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TUNING_H_ +#define CLBLAST_TUNING_H_ + +#include +#include + +#include + +#include "utilities.hpp" + +namespace clblast { +// ================================================================================================= + +// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect +// the results. Used for all types of kernel families. Note that this is a header-only function so +// that it is automatically compiled for the various kernels (given as the 'C' template argument). +template +void Tuner(int argc, char* argv[]) { + + // Sets the parameters and platform/device for which to tune (command-line options) + auto help = std::string{"* Options given/available:\n"}; + auto args = Arguments{}; + args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); + args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); + args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); + for (auto &o: C::GetOptions()) { + if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, C::DefaultM()); } + if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, C::DefaultN()); } + if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, C::DefaultK()); } + if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar()); } + if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar()); } + if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); } + } + fprintf(stdout, "%s\n", help.c_str()); + + // Tests validity of the given arguments + C::TestValidArguments(args); + + // Tests for validity of the precision and retrieves properties + auto isAMD = false; + auto isARM = false; + auto isGPU = false; + { + const auto platform = 
Platform(args.platform_id); + const auto device = Device(platform, args.device_id); + if (!PrecisionSupported(device)) { + printf("* Unsupported precision, skipping this tuning run\n\n"); + return; + } + isAMD = device.IsAMD(); + isARM = device.IsARM(); + isGPU = device.IsGPU(); + } + + // Creates input buffers with random data + auto x_vec = std::vector(C::GetSizeX(args)); + auto y_vec = std::vector(C::GetSizeY(args)); + auto a_mat = std::vector(C::GetSizeA(args)); + auto b_mat = std::vector(C::GetSizeB(args)); + auto c_mat = std::vector(C::GetSizeC(args)); + auto temp = std::vector(C::GetSizeTemp(args)); + PopulateVector(x_vec); + PopulateVector(y_vec); + PopulateVector(a_mat); + PopulateVector(b_mat); + PopulateVector(c_mat); + PopulateVector(temp); + + // Initializes the tuner for the chosen device + cltune::Tuner tuner(args.platform_id, args.device_id); + + // Use full-search to explore all parameter combinations or random-search to search only a part of + // the parameter values. The fraction is set as a command-line argument. + if (args.fraction == 1.0 || args.fraction == 0.0) { + tuner.UseFullSearch(); + } + else { + tuner.UseRandomSearch(1.0/args.fraction); + } + + // Set extra settings for specific defines. This mimics src/routine.cc. 
+ auto defines = std::string{""}; + if (isAMD && isGPU) { + defines += "#define USE_CL_MAD 1\n"; + defines += "#define USE_STAGGERED_INDICES 1\n"; + } + if (isARM && isGPU) { + defines += "#define GLOBAL_MEM_FENCE 1\n"; + } + + // Loads the kernel sources and defines the kernel to tune + auto sources = defines + C::GetSources(); + auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize()); + tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef()); + + // Sets the tunable parameters and their possible values + C::SetParameters(tuner, id); + C::SetConstraints(tuner, id); + C::SetLocalMemorySize(tuner, id, args); + + // Tests for a specific precision + tuner.AddParameter(id, "PRECISION", {static_cast(args.precision)}); + tuner.AddParameterReference("PRECISION", static_cast(args.precision)); + + // Modifies the thread-sizes (both global and local) based on the parameters + for (auto ¶meters: C::MulLocal()) { tuner.MulLocalSize(id, parameters); } + for (auto ¶meters: C::DivLocal()) { tuner.DivLocalSize(id, parameters); } + for (auto ¶meters: C::MulGlobal()) { tuner.MulGlobalSize(id, parameters); } + for (auto ¶meters: C::DivGlobal()) { tuner.DivGlobalSize(id, parameters); } + + // Sets the function's arguments + C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp); + + // Starts the tuning process + tuner.Tune(); + + // Prints the results to screen + auto time_ms = tuner.PrintToScreen(); + tuner.PrintFormatted(); + + // Also prints the performance of the best-case in terms of GB/s or GFLOPS + if (time_ms != 0.0) { + printf("[ -------> ] %.1lf ms", time_ms); + printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str()); + } + + // Outputs the results as JSON to disk, including some meta-data + auto precision_string = std::to_string(static_cast(args.precision)); + auto metadata = std::vector>{ + {"kernel_family", C::KernelFamily()}, + 
{"precision", precision_string} + }; + for (auto &o: C::GetOptions()) { + if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); } + if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); } + if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); } + if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); } + if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); } + } + tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata); +} + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TUNING_H_ +#endif diff --git a/src/tuning/xaxpy.cc b/src/tuning/xaxpy.cc deleted file mode 100644 index d27cb73d..00000000 --- a/src/tuning/xaxpy.cc +++ /dev/null @@ -1,125 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the xaxpy OpenCL kernels. 
-// -// ================================================================================================= - -#include -#include - -#include "internal/utilities.h" -#include "internal/tuning.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneXaxpy { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "xaxpy"; } - static std::string KernelName() { return "XaxpyFast"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level1/level1.opencl" - #include "../src/kernels/level1/xaxpy.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgN, kArgAlpha}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &args) { - if (!IsMultiple(args.n, 64)) { - throw std::runtime_error("'XaxpyFast' requires 'n' to be a multiple of WGS*WPT*VW"); - } - } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1; } // N/A for this kernel - static size_t DefaultN() { return 4096*1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { return args.n; } - static size_t GetSizeY(const Arguments &args) { return args.n; } - static size_t GetSizeA(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - 
static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 2048}); - tuner.AddParameter(id, "WPT", {1, 2, 4, 8}); - tuner.AddParameter(id, "VW", {1, 2, 4, 8}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1}; } - static std::vector LocalSizeRef() { return {64}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"WGS"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"WPT"},{"VW"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &x_vec, std::vector &y_vec, - std::vector &, std::vector &, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentOutput(y_vec); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 3 * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast 
namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/xdot.cc b/src/tuning/xdot.cc deleted file mode 100644 index 5f30296c..00000000 --- a/src/tuning/xdot.cc +++ /dev/null @@ -1,137 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the xdot OpenCL kernels. Note that the results are -// not verified, since the result is not final and depends on the WGS2 parameter. -// -// ================================================================================================= - -#include -#include - -#include "internal/utilities.h" -#include "internal/tuning.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneXdot { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "xdot_"+std::to_string(V); } - static std::string KernelName() { return (V==1) ? 
"Xdot" : "XdotEpilogue"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level1/xdot.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgN}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1; } // N/A for this kernel - static size_t DefaultN() { return 2*1024*1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { return args.n; } - static size_t GetSizeY(const Arguments &args) { return args.n; } - static size_t GetSizeA(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &args) { return args.n; } // Worst case - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &) { return (V==1) ? std::vector{2*64} : std::vector{1}; } - static std::vector GlobalSizeRef(const Arguments &) { return (V==1) ? 
std::vector{2*64*64} : std::vector{64}; } - static std::vector LocalSize() { return {1}; } - static std::vector LocalSizeRef() { return {64}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } - static TransformVector DivGlobal() { return {}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &x_vec, std::vector &y_vec, - std::vector &, std::vector &, std::vector &, - std::vector &temp) { - if (V == 1) { - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentInput(y_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentInput(temp); // No output checking for the result - size varies - tuner.AddArgumentScalar(static_cast(false)); - } - else { - tuner.AddArgumentInput(temp); - tuner.AddArgumentInput(x_vec); // No output checking for the result - store somewhere - tuner.AddArgumentScalar(0); - } - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return (V==1) ? (2*args.n + 1) * GetBytes(args.precision) : 1 * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return (V==1) ? 
"GB/s" : "N/A"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Function to tune a specific variation V (not within the clblast namespace) -template -void StartVariation(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } -} - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - StartVariation<1>(argc, argv); - StartVariation<2>(argc, argv); - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/xgemm.cc b/src/tuning/xgemm.cc deleted file mode 100644 index d309b830..00000000 --- a/src/tuning/xgemm.cc +++ /dev/null @@ -1,162 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. 
-// -// ================================================================================================= - -#include -#include - -#include "internal/utilities.h" -#include "internal/tuning.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneXgemm { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "xgemm"; } - static std::string KernelName() { return "Xgemm"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/xgemm_part1.opencl" - #include "../src/kernels/level3/xgemm_part2.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction}; - } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1024; } - static double DefaultFraction() { return 2048.0; } - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.k; } - static size_t GetSizeB(const Arguments &args) { return args.n * args.k; } - static size_t GetSizeC(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); - 
tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "KWG", {16, 32}); - tuner.AddParameter(id, "MDIMC", {8, 16, 32}); - tuner.AddParameter(id, "NDIMC", {8, 16, 32}); - tuner.AddParameter(id, "MDIMA", {8, 16, 32}); - tuner.AddParameter(id, "NDIMB", {8, 16, 32}); - tuner.AddParameter(id, "KWI", {2, 8}); - tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); - tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); - tuner.AddParameter(id, "STRM", {0, 1}); - tuner.AddParameter(id, "STRN", {0, 1}); - tuner.AddParameter(id, "SA", {0, 1}); - tuner.AddParameter(id, "SB", {0, 1}); - } - - // Sets the constraints - static void SetConstraints(cltune::Tuner &tuner, const size_t id) { - auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; - auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; - auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; - // Requirement for unrolling the KWG loop - tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"}); - // Required for integer MWI and NWI - tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}); - tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}); - // Required for integer MWIA and NWIB - tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}); - tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}); - // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) 
- tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}); - tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}); - } - - // Sets the local memory size - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { - return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM", - "SB", "KWG", "NWG", "VWN"}); - } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"MDIMC", "NDIMC"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {{"MDIMC", "NDIMC"}}; } - static TransformVector DivGlobal() { return {{"MWG", "NWG"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &c_mat, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - auto beta_buffer = std::vector{args.beta}; - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.k)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(beta_buffer); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentInput(b_mat); - tuner.AddArgumentOutput(c_mat); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * 
args.m * args.n * args.k; - } - static std::string PerformanceUnit() { return "GFLOPS"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc deleted file mode 100644 index 6587dcf4..00000000 --- a/src/tuning/xgemv.cc +++ /dev/null @@ -1,156 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the xgemv OpenCL kernels. 
Three variants are tuned: -// 1: The full version of the kernel -// 2: The fast version for non-transposed matrices -// 3: The fast version for transposed matrices -// -// ================================================================================================= - -#include -#include - -#include "internal/utilities.h" -#include "internal/tuning.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneXgemv { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "xgemv_"+std::to_string(V); } - static std::string KernelName() { return (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot"); } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level2/xgemv.opencl" - #include "../src/kernels/level2/xgemv_fast.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha, kArgBeta}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 2048; } - static size_t DefaultN() { return 2048; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { return args.n; } - static size_t GetSizeY(const Arguments &args) { return args.m; } - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 
1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256}); - tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); - if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); } - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &tuner, const size_t id) { - auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; - if (V==2 || V==3) { - tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}); - } - } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { return v[0]*GetBytes(args.precision); }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)}); - } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1}; } - static std::vector LocalSizeRef() { return {64}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"WGS"+std::to_string(V)}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"WPT"+std::to_string(V)}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &x_vec, std::vector &y_vec, - std::vector &a_mat, std::vector &, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - auto beta_buffer = std::vector{args.beta}; - auto a_rotated = (V==3) ? 
1 : 0; - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(beta_buffer); - tuner.AddArgumentScalar(static_cast(a_rotated)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentOutput(y_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentScalar(0); // Conjugate transpose - tuner.AddArgumentScalar(0); // Additional parameter - tuner.AddArgumentScalar(0); // Banded 'kl' - tuner.AddArgumentScalar(0); // Banded 'ku' - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return (args.m*args.n + 2*args.m + args.n) * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Function to tune a specific variation V (not within the clblast namespace) -template -void StartVariation(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } -} - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - StartVariation<1>(argc, argv); - StartVariation<2>(argc, argv); - StartVariation<3>(argc, 
argv); - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/xger.cc b/src/tuning/xger.cc deleted file mode 100644 index 4be80c86..00000000 --- a/src/tuning/xger.cc +++ /dev/null @@ -1,130 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the xger OpenCL kernels. -// -// ================================================================================================= - -#include -#include - -#include "internal/utilities.h" -#include "internal/tuning.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneXger { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "xger"; } - static std::string KernelName() { return "Xger"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level2/level2.opencl" - #include "../src/kernels/level2/xger.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgN, kArgM, kArgAlpha}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this 
kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { return args.m; } - static size_t GetSizeY(const Arguments &args) { return args.n; } - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS1", {4, 8, 16, 32, 64, 128, 256, 512}); - tuner.AddParameter(id, "WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256}); - tuner.AddParameter(id, "WPT", {1, 2, 4}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"WGS1", "WGS2"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"WPT", "WPT"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &x_vec, std::vector &y_vec, - std::vector &a_mat, std::vector &, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - 
tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); // x_offset - tuner.AddArgumentScalar(1); // x_increment - tuner.AddArgumentInput(y_vec); - tuner.AddArgumentScalar(0); // y_offset - tuner.AddArgumentScalar(1); // y_increment - tuner.AddArgumentOutput(a_mat); - tuner.AddArgumentScalar(0); // a_offset - tuner.AddArgumentScalar(static_cast(args.m)); // a_ld - tuner.AddArgumentScalar(0); // a_is_rowmajor - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/utilities.cc b/src/utilities.cc index 30b09a5f..e3a1fb75 100644 --- a/src/utilities.cc +++ b/src/utilities.cc @@ -11,7 +11,7 @@ // // ================================================================================================= -#include "internal/utilities.h" 
+#include "utilities.hpp" #include #include diff --git a/src/utilities.hpp b/src/utilities.hpp new file mode 100644 index 00000000..9a2b9ffc --- /dev/null +++ b/src/utilities.hpp @@ -0,0 +1,257 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file provides declarations for the common (test) utility functions such as a command-line +// argument parser. On top of this, it serves as the 'common' header, including the C++ OpenCL +// wrapper. These utilities are not only used for CLBlast, but also included as part of the tuners, +// the performance client and the correctness testers. +// +// ================================================================================================= + +#ifndef CLBLAST_UTILITIES_H_ +#define CLBLAST_UTILITIES_H_ + +#include +#include +#include + +#include "clblast.h" +#include "clblast_half.h" +#include "clpp11.hpp" + +namespace clblast { +// ================================================================================================= + +// Shorthands for complex data-types +using float2 = std::complex; +using double2 = std::complex; + +// Khronos OpenCL extensions +const std::string kKhronosHalfPrecision = "cl_khr_fp16"; +const std::string kKhronosDoublePrecision = "cl_khr_fp64"; + +// Caught an unknown error +constexpr auto kUnknownError = -999; + +// ================================================================================================= + +// The routine-specific arguments in string form +constexpr auto kArgM = "m"; +constexpr auto kArgN = "n"; +constexpr auto kArgK = "k"; +constexpr auto kArgKL = "kl"; +constexpr auto kArgKU = "ku"; +constexpr auto kArgLayout = "layout"; 
+constexpr auto kArgATransp = "transA"; +constexpr auto kArgBTransp = "transB"; +constexpr auto kArgSide = "side"; +constexpr auto kArgTriangle = "triangle"; +constexpr auto kArgDiagonal = "diagonal"; +constexpr auto kArgXInc = "incx"; +constexpr auto kArgYInc = "incy"; +constexpr auto kArgXOffset = "offx"; +constexpr auto kArgYOffset = "offy"; +constexpr auto kArgALeadDim = "lda"; +constexpr auto kArgBLeadDim = "ldb"; +constexpr auto kArgCLeadDim = "ldc"; +constexpr auto kArgAOffset = "offa"; +constexpr auto kArgBOffset = "offb"; +constexpr auto kArgCOffset = "offc"; +constexpr auto kArgAPOffset = "offap"; +constexpr auto kArgDotOffset = "offdot"; +constexpr auto kArgNrm2Offset = "offnrm2"; +constexpr auto kArgAsumOffset = "offasum"; +constexpr auto kArgImaxOffset = "offimax"; +constexpr auto kArgAlpha = "alpha"; +constexpr auto kArgBeta = "beta"; + +// The tuner-specific arguments in string form +constexpr auto kArgFraction = "fraction"; + +// The client-specific arguments in string form +constexpr auto kArgCompareclblas = "clblas"; +constexpr auto kArgComparecblas = "cblas"; +constexpr auto kArgStepSize = "step"; +constexpr auto kArgNumSteps = "num_steps"; +constexpr auto kArgNumRuns = "runs"; + +// The test-specific arguments in string form +constexpr auto kArgFullTest = "full_test"; +constexpr auto kArgVerbose = "verbose"; + +// The common arguments in string form +constexpr auto kArgPlatform = "platform"; +constexpr auto kArgDevice = "device"; +constexpr auto kArgPrecision = "precision"; +constexpr auto kArgHelp = "h"; +constexpr auto kArgQuiet = "q"; +constexpr auto kArgNoAbbreviations = "no_abbrv"; + +// ================================================================================================= + +// Returns a scalar with a default value +template +T GetScalar(); + +// Returns a scalar of value 1 +template +T ConstantOne(); + +// ================================================================================================= + +// Structure 
containing all possible arguments for test clients, including their default values +template +struct Arguments { + // Routine-specific arguments + size_t m = 1; + size_t n = 1; + size_t k = 1; + size_t ku = 1; + size_t kl = 1; + Layout layout = Layout::kRowMajor; + Transpose a_transpose = Transpose::kNo; + Transpose b_transpose = Transpose::kNo; + Side side = Side::kLeft; + Triangle triangle = Triangle::kUpper; + Diagonal diagonal = Diagonal::kUnit; + size_t x_inc = 1; + size_t y_inc = 1; + size_t x_offset = 0; + size_t y_offset = 0; + size_t a_ld = 1; + size_t b_ld = 1; + size_t c_ld = 1; + size_t a_offset = 0; + size_t b_offset = 0; + size_t c_offset = 0; + size_t ap_offset = 0; + size_t dot_offset = 0; + size_t nrm2_offset = 0; + size_t asum_offset = 0; + size_t imax_offset = 0; + T alpha = ConstantOne(); + T beta = ConstantOne(); + size_t x_size = 1; + size_t y_size = 1; + size_t a_size = 1; + size_t b_size = 1; + size_t c_size = 1; + size_t ap_size = 1; + size_t scalar_size = 1; + // Tuner-specific arguments + double fraction = 1.0; + // Client-specific arguments + int compare_clblas = 1; + int compare_cblas = 1; + size_t step = 1; + size_t num_steps = 0; + size_t num_runs = 10; + // Common arguments + size_t platform_id = 0; + size_t device_id = 0; + Precision precision = Precision::kSingle; + bool print_help = false; + bool silent = false; + bool no_abbrv = false; +}; + +// Structure containing all possible buffers for test clients +template +struct Buffers { + Buffer x_vec; + Buffer y_vec; + Buffer a_mat; + Buffer b_mat; + Buffer c_mat; + Buffer ap_mat; + Buffer scalar; +}; + +// ================================================================================================= + +// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast +// data-types such as the Layout and Transpose data-types. 
+template +std::string ToString(T value); + +// ================================================================================================= + +// Helper for the function "GetArgument" +template +T ConvertArgument(const char* value); + +// Basic argument parser, matching patterns in the form of "-option value" and "--option value" +template +T GetArgument(const int argc, char *argv[], std::string &help, + const std::string &option, const T default_value); + +// Returns the precision only +Precision GetPrecision(const int argc, char *argv[], + const Precision default_precision = Precision::kSingle); + +// As in "GetArgument", but now only checks whether an argument is given or not +bool CheckArgument(const int argc, char *argv[], std::string &help, const std::string &option); + +// ================================================================================================= + +// Helper function to check for errors in the status code +constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); } + +// ================================================================================================= + +// Returns a random number to be used as a seed +unsigned int GetRandomSeed(); + +// Test/example data lower and upper limit +constexpr auto kTestDataLowerLimit = -2.0; +constexpr auto kTestDataUpperLimit = 2.0; + +// Populates a vector with random data +template +void PopulateVector(std::vector &vector); + +// ================================================================================================= + +// Conversion between half and single-precision +std::vector HalfToFloatBuffer(const std::vector& source); +void FloatToHalfBuffer(std::vector& result, const std::vector& source); + +// As above, but now for OpenCL data-types instead of std::vectors +Buffer HalfToFloatBuffer(const Buffer& source, cl_command_queue queue_raw); +void FloatToHalfBuffer(Buffer& result, const Buffer& source, cl_command_queue queue_raw); + +// 
================================================================================================= + +// Rounding functions +size_t CeilDiv(const size_t x, const size_t y); +size_t Ceil(const size_t x, const size_t y); + +// Returns whether or not 'a' is a multiple of 'b' +bool IsMultiple(const size_t a, const size_t b); + +// ================================================================================================= + +// Convert the precision enum into bytes, e.g. a double takes up 8 bytes +size_t GetBytes(const Precision precision); + +// Convert the template argument into a precision value +template +Precision PrecisionValue(); + +// ================================================================================================= + +// Returns false if this precision is not supported by the device +template +bool PrecisionSupported(const Device &device); + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_UTILITIES_H_ +#endif -- cgit v1.2.3