summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/buffer_test.hpp121
-rw-r--r--src/cache.cc2
-rw-r--r--src/cache.hpp98
-rw-r--r--src/clblast.cc92
-rw-r--r--src/clblast_c.cc2
-rw-r--r--src/clpp11.hpp695
-rw-r--r--src/database/database.cc (renamed from src/database.cc)24
-rw-r--r--src/database/database.hpp104
-rw-r--r--src/database/kernels/copy.hpp262
-rw-r--r--src/database/kernels/pad.hpp270
-rw-r--r--src/database/kernels/padtranspose.hpp270
-rw-r--r--src/database/kernels/transpose.hpp258
-rw-r--r--src/database/kernels/xaxpy.hpp270
-rw-r--r--src/database/kernels/xdot.hpp200
-rw-r--r--src/database/kernels/xgemm.hpp263
-rw-r--r--src/database/kernels/xgemv.hpp231
-rw-r--r--src/database/kernels/xger.hpp220
-rw-r--r--src/public_api.hpp34
-rw-r--r--src/routine.cc2
-rw-r--r--src/routine.hpp68
-rw-r--r--src/routines/common.cc2
-rw-r--r--src/routines/common.hpp173
-rw-r--r--src/routines/level1/xamax.cc2
-rw-r--r--src/routines/level1/xamax.hpp40
-rw-r--r--src/routines/level1/xasum.cc2
-rw-r--r--src/routines/level1/xasum.hpp40
-rw-r--r--src/routines/level1/xaxpy.cc2
-rw-r--r--src/routines/level1/xaxpy.hpp40
-rw-r--r--src/routines/level1/xcopy.cc2
-rw-r--r--src/routines/level1/xcopy.hpp40
-rw-r--r--src/routines/level1/xdot.cc2
-rw-r--r--src/routines/level1/xdot.hpp42
-rw-r--r--src/routines/level1/xdotc.cc2
-rw-r--r--src/routines/level1/xdotc.hpp44
-rw-r--r--src/routines/level1/xdotu.cc2
-rw-r--r--src/routines/level1/xdotu.hpp44
-rw-r--r--src/routines/level1/xmax.hpp49
-rw-r--r--src/routines/level1/xmin.hpp49
-rw-r--r--src/routines/level1/xnrm2.cc2
-rw-r--r--src/routines/level1/xnrm2.hpp40
-rw-r--r--src/routines/level1/xscal.cc2
-rw-r--r--src/routines/level1/xscal.hpp39
-rw-r--r--src/routines/level1/xsum.hpp49
-rw-r--r--src/routines/level1/xswap.cc2
-rw-r--r--src/routines/level1/xswap.hpp40
-rw-r--r--src/routines/level2/xgbmv.cc2
-rw-r--r--src/routines/level2/xgbmv.hpp49
-rw-r--r--src/routines/level2/xgemv.cc2
-rw-r--r--src/routines/level2/xgemv.hpp56
-rw-r--r--src/routines/level2/xger.cc2
-rw-r--r--src/routines/level2/xger.hpp43
-rw-r--r--src/routines/level2/xgerc.cc2
-rw-r--r--src/routines/level2/xgerc.hpp46
-rw-r--r--src/routines/level2/xgeru.cc2
-rw-r--r--src/routines/level2/xgeru.hpp46
-rw-r--r--src/routines/level2/xhbmv.cc2
-rw-r--r--src/routines/level2/xhbmv.hpp49
-rw-r--r--src/routines/level2/xhemv.cc2
-rw-r--r--src/routines/level2/xhemv.hpp49
-rw-r--r--src/routines/level2/xher.cc2
-rw-r--r--src/routines/level2/xher.hpp46
-rw-r--r--src/routines/level2/xher2.cc2
-rw-r--r--src/routines/level2/xher2.hpp44
-rw-r--r--src/routines/level2/xhpmv.cc2
-rw-r--r--src/routines/level2/xhpmv.hpp49
-rw-r--r--src/routines/level2/xhpr.cc2
-rw-r--r--src/routines/level2/xhpr.hpp45
-rw-r--r--src/routines/level2/xhpr2.cc2
-rw-r--r--src/routines/level2/xhpr2.hpp46
-rw-r--r--src/routines/level2/xsbmv.cc2
-rw-r--r--src/routines/level2/xsbmv.hpp49
-rw-r--r--src/routines/level2/xspmv.cc2
-rw-r--r--src/routines/level2/xspmv.hpp49
-rw-r--r--src/routines/level2/xspr.cc2
-rw-r--r--src/routines/level2/xspr.hpp45
-rw-r--r--src/routines/level2/xspr2.cc2
-rw-r--r--src/routines/level2/xspr2.hpp46
-rw-r--r--src/routines/level2/xsymv.cc2
-rw-r--r--src/routines/level2/xsymv.hpp49
-rw-r--r--src/routines/level2/xsyr.cc2
-rw-r--r--src/routines/level2/xsyr.hpp45
-rw-r--r--src/routines/level2/xsyr2.cc2
-rw-r--r--src/routines/level2/xsyr2.hpp46
-rw-r--r--src/routines/level2/xtbmv.cc2
-rw-r--r--src/routines/level2/xtbmv.hpp49
-rw-r--r--src/routines/level2/xtpmv.cc2
-rw-r--r--src/routines/level2/xtpmv.hpp49
-rw-r--r--src/routines/level2/xtrmv.cc2
-rw-r--r--src/routines/level2/xtrmv.hpp49
-rw-r--r--src/routines/level3/xgemm.cc2
-rw-r--r--src/routines/level3/xgemm.hpp48
-rw-r--r--src/routines/level3/xhemm.cc2
-rw-r--r--src/routines/level3/xhemm.hpp54
-rw-r--r--src/routines/level3/xher2k.cc2
-rw-r--r--src/routines/level3/xher2k.hpp46
-rw-r--r--src/routines/level3/xherk.cc2
-rw-r--r--src/routines/level3/xherk.hpp45
-rw-r--r--src/routines/level3/xsymm.cc2
-rw-r--r--src/routines/level3/xsymm.hpp56
-rw-r--r--src/routines/level3/xsyr2k.cc2
-rw-r--r--src/routines/level3/xsyr2k.hpp46
-rw-r--r--src/routines/level3/xsyrk.cc2
-rw-r--r--src/routines/level3/xsyrk.hpp47
-rw-r--r--src/routines/level3/xtrmm.cc2
-rw-r--r--src/routines/level3/xtrmm.hpp54
-rw-r--r--src/routines/levelx/xomatcopy.cc2
-rw-r--r--src/routines/levelx/xomatcopy.hpp41
-rw-r--r--src/tuning/kernels/copy_fast.cc (renamed from src/tuning/copy_fast.cc)4
-rw-r--r--src/tuning/kernels/copy_pad.cc (renamed from src/tuning/copy_pad.cc)4
-rw-r--r--src/tuning/kernels/transpose_fast.cc (renamed from src/tuning/transpose_fast.cc)4
-rw-r--r--src/tuning/kernels/transpose_pad.cc (renamed from src/tuning/transpose_pad.cc)4
-rw-r--r--src/tuning/kernels/xaxpy.cc (renamed from src/tuning/xaxpy.cc)4
-rw-r--r--src/tuning/kernels/xdot.cc (renamed from src/tuning/xdot.cc)4
-rw-r--r--src/tuning/kernels/xgemm.cc (renamed from src/tuning/xgemm.cc)4
-rw-r--r--src/tuning/kernels/xgemv.cc (renamed from src/tuning/xgemv.cc)4
-rw-r--r--src/tuning/kernels/xger.cc (renamed from src/tuning/xger.cc)4
-rw-r--r--src/tuning/tuning.hpp161
-rw-r--r--src/utilities.cc2
-rw-r--r--src/utilities.hpp257
119 files changed, 6114 insertions, 122 deletions
diff --git a/src/buffer_test.hpp b/src/buffer_test.hpp
new file mode 100644
index 00000000..80f5243f
--- /dev/null
+++ b/src/buffer_test.hpp
@@ -0,0 +1,121 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are
+// templated and thus header-only.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_BUFFER_TEST_H_
+#define CLBLAST_BUFFER_TEST_H_
+
+#include "clblast.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Checks whether matrix 'A' is valid: the leading dimension 'ld' has to be at least 'one', and
+// the buffer has to hold at least 'ld * (two - 1) + one + offset' elements of type 'T'. Returns
+// the status code of the first failing check, 'kInvalidMatrixA' if anything throws while testing
+// the buffer size, or 'kSuccess' otherwise.
+// NOTE(review): 'two - 1' wraps around for 'two' == 0 because 'size_t' is unsigned; presumably
+// callers reject empty dimensions before calling this - confirm.
+template <typename T>
+StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
+                       const size_t offset, const size_t ld) {
+  if (ld < one) { return StatusCode::kInvalidLeadDimA; }
+  try {
+    const auto num_elements = ld * (two - 1) + one + offset;
+    const auto bytes_needed = num_elements * sizeof(T);
+    if (bytes_needed > buffer.GetSize()) { return StatusCode::kInsufficientMemoryA; }
+  }
+  catch (...) { return StatusCode::kInvalidMatrixA; }
+  return StatusCode::kSuccess;
+}
+
+// Checks whether matrix 'B' is valid: the leading dimension 'ld' has to be at least 'one', and
+// the buffer has to hold at least 'ld * (two - 1) + one + offset' elements of type 'T'. Returns
+// the status code of the first failing check, 'kInvalidMatrixB' if anything throws while testing
+// the buffer size, or 'kSuccess' otherwise.
+// NOTE(review): 'two - 1' wraps around for 'two' == 0 because 'size_t' is unsigned; presumably
+// callers reject empty dimensions before calling this - confirm.
+template <typename T>
+StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
+                       const size_t offset, const size_t ld) {
+  if (ld < one) { return StatusCode::kInvalidLeadDimB; }
+  try {
+    const auto num_elements = ld * (two - 1) + one + offset;
+    const auto bytes_needed = num_elements * sizeof(T);
+    if (bytes_needed > buffer.GetSize()) { return StatusCode::kInsufficientMemoryB; }
+  }
+  catch (...) { return StatusCode::kInvalidMatrixB; }
+  return StatusCode::kSuccess;
+}
+
+// Checks whether matrix 'C' is valid: the leading dimension 'ld' has to be at least 'one', and
+// the buffer has to hold at least 'ld * (two - 1) + one + offset' elements of type 'T'. Returns
+// the status code of the first failing check, 'kInvalidMatrixC' if anything throws while testing
+// the buffer size, or 'kSuccess' otherwise.
+// NOTE(review): 'two - 1' wraps around for 'two' == 0 because 'size_t' is unsigned; presumably
+// callers reject empty dimensions before calling this - confirm.
+template <typename T>
+StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
+                       const size_t offset, const size_t ld) {
+  if (ld < one) { return StatusCode::kInvalidLeadDimC; }
+  try {
+    const auto num_elements = ld * (two - 1) + one + offset;
+    const auto bytes_needed = num_elements * sizeof(T);
+    if (bytes_needed > buffer.GetSize()) { return StatusCode::kInsufficientMemoryC; }
+  }
+  catch (...) { return StatusCode::kInvalidMatrixC; }
+  return StatusCode::kSuccess;
+}
+
+// Checks whether packed matrix 'AP' is valid: the buffer has to hold at least
+// '(n * (n + 1)) / 2 + offset' elements of type 'T'. Failures are reported through the status
+// codes of matrix 'A'; 'kInvalidMatrixA' is returned if anything throws while testing the size.
+template <typename T>
+StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+  try {
+    const auto packed_elements = (n * (n + 1)) / 2;
+    const auto bytes_needed = (packed_elements + offset) * sizeof(T);
+    if (bytes_needed > buffer.GetSize()) { return StatusCode::kInsufficientMemoryA; }
+  }
+  catch (...) { return StatusCode::kInvalidMatrixA; }
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Checks whether vector 'X' is valid: the increment must be non-zero, and the buffer has to hold
+// at least '(n - 1) * inc + 1 + offset' elements of type 'T'. Returns the status code of the
+// first failing check, 'kInvalidVectorX' if anything throws while testing the buffer size, or
+// 'kSuccess' otherwise.
+// NOTE(review): 'n - 1' wraps around for 'n' == 0 because 'size_t' is unsigned; presumably
+// callers reject empty vectors before calling this - confirm.
+template <typename T>
+StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
+                       const size_t inc) {
+  if (inc == 0) { return StatusCode::kInvalidIncrementX; }
+  try {
+    const auto num_elements = (n - 1) * inc + 1 + offset;
+    const auto bytes_needed = num_elements * sizeof(T);
+    if (bytes_needed > buffer.GetSize()) { return StatusCode::kInsufficientMemoryX; }
+  }
+  catch (...) { return StatusCode::kInvalidVectorX; }
+  return StatusCode::kSuccess;
+}
+
+// Checks whether vector 'Y' is valid: the increment must be non-zero, and the buffer has to hold
+// at least '(n - 1) * inc + 1 + offset' elements of type 'T'. Returns the status code of the
+// first failing check, 'kInvalidVectorY' if anything throws while testing the buffer size, or
+// 'kSuccess' otherwise.
+// NOTE(review): 'n - 1' wraps around for 'n' == 0 because 'size_t' is unsigned; presumably
+// callers reject empty vectors before calling this - confirm.
+template <typename T>
+StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
+                       const size_t inc) {
+  if (inc == 0) { return StatusCode::kInvalidIncrementY; }
+  try {
+    const auto num_elements = (n - 1) * inc + 1 + offset;
+    const auto bytes_needed = num_elements * sizeof(T);
+    if (bytes_needed > buffer.GetSize()) { return StatusCode::kInsufficientMemoryY; }
+  }
+  catch (...) { return StatusCode::kInvalidVectorY; }
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Checks whether vector 'scalar' is valid: the buffer has to hold at least 'n + offset' elements
+// of type 'T'. Returns 'kInsufficientMemoryScalar' if it does not, 'kInvalidVectorScalar' if
+// anything throws while testing the buffer size, or 'kSuccess' otherwise.
+template <typename T>
+StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+  try {
+    const auto bytes_needed = (n + offset) * sizeof(T);
+    if (bytes_needed > buffer.GetSize()) { return StatusCode::kInsufficientMemoryScalar; }
+  }
+  catch (...) { return StatusCode::kInvalidVectorScalar; }
+  return StatusCode::kSuccess;
+}
+
+// Tests vector 'index' for validity: the buffer has to hold at least 'n + offset' elements of
+// type 'T'. Note that this function reports failures through the 'scalar' status codes
+// ('kInsufficientMemoryScalar' and 'kInvalidVectorScalar'), just like 'TestVectorScalar' does.
+template <typename T>
+StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+ try {
+ const auto required_size = (n + offset) * sizeof(T);
+ if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
+ } catch (...) { return StatusCode::kInvalidVectorScalar; }
+ return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_BUFFER_TEST_H_
+#endif
diff --git a/src/cache.cc b/src/cache.cc
index a34d351f..cd9055d0 100644
--- a/src/cache.cc
+++ b/src/cache.cc
@@ -15,7 +15,7 @@
#include <vector>
#include <mutex>
-#include "internal/cache.h"
+#include "cache.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/cache.hpp b/src/cache.hpp
new file mode 100644
index 00000000..0d74d7bc
--- /dev/null
+++ b/src/cache.hpp
@@ -0,0 +1,98 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the caching functionality of compiled binaries and programs.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CACHE_H_
+#define CLBLAST_CACHE_H_
+
+#include <string>
+#include <vector>
+#include <mutex>
+
+#include "utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// A single entry of the binary cache: a compiled OpenCL binary along with the meta-data (device
+// name, precision, and routine name) by which it is looked up
+struct BinaryCache {
+  std::string binary;
+  std::string device_name;
+  Precision precision;
+  std::string routine_name_;
+
+  // Returns true only if the device name, the precision, and the routine name all match
+  bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
+                    const std::string &ref_routine) {
+    if (!(device_name == ref_device)) { return false; }
+    if (!(precision == ref_precision)) { return false; }
+    return (routine_name_ == ref_routine);
+  }
+};
+
+// The actual cache, implemented as a vector of the above data-type, and its mutex
+// NOTE(review): these are 'static' at namespace scope inside a header, so every translation unit
+// that includes this file gets its own separate vector and mutex - confirm that only a single
+// source file is meant to access them.
+static std::vector<BinaryCache> binary_cache_;
+static std::mutex binary_cache_mutex_;
+
+// =================================================================================================
+
+// A single entry of the program cache: a compiled OpenCL program along with the meta-data
+// (context pointer, precision, and routine name) by which it is looked up
+struct ProgramCache {
+  Program program;
+  ContextPointer context_ptr;
+  Precision precision;
+  std::string routine_name_;
+
+  // Returns true only if the context pointer, the precision, and the routine name all match
+  bool MatchInCache(const ContextPointer ref_context, const Precision &ref_precision,
+                    const std::string &ref_routine) {
+    if (!(context_ptr == ref_context)) { return false; }
+    if (!(precision == ref_precision)) { return false; }
+    return (routine_name_ == ref_routine);
+  }
+};
+
+// The actual cache, implemented as a vector of the above data-type, and its mutex
+// NOTE(review): these are 'static' at namespace scope inside a header, so every translation unit
+// that includes this file gets its own separate vector and mutex - confirm that only a single
+// source file is meant to access them.
+static std::vector<ProgramCache> program_cache_;
+static std::mutex program_cache_mutex_;
+
+// =================================================================================================
+
+// Stores the compiled binary or program in the cache
+void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
+ const Precision &precision, const std::string &routine_name);
+void StoreProgramToCache(const Program &program, const Context &context,
+ const Precision &precision, const std::string &routine_name);
+
+// Queries the cache and retrieves a matching binary or program. Assumes that the match is
+// available, throws otherwise.
+const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
+ const std::string &routine_name);
+const Program& GetProgramFromCache(const Context &context, const Precision &precision,
+ const std::string &routine_name);
+
+// Queries the cache to see whether or not the compiled kernel is already there
+bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
+ const std::string &routine_name);
+bool ProgramIsInCache(const Context &context, const Precision &precision,
+ const std::string &routine_name);
+
+// =================================================================================================
+
+// Clears the cache of stored binaries
+StatusCode CacheClearAll();
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_CACHE_H_
+#endif
diff --git a/src/clblast.cc b/src/clblast.cc
index d0f0c937..88d60772 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -16,60 +16,60 @@
#include <string>
#include "clblast.h"
-#include "internal/public_api.h"
-#include "internal/cache.h"
+#include "public_api.hpp"
+#include "cache.hpp"
// BLAS level-1 includes
-#include "internal/routines/level1/xswap.h"
-#include "internal/routines/level1/xscal.h"
-#include "internal/routines/level1/xcopy.h"
-#include "internal/routines/level1/xaxpy.h"
-#include "internal/routines/level1/xdot.h"
-#include "internal/routines/level1/xdotu.h"
-#include "internal/routines/level1/xdotc.h"
-#include "internal/routines/level1/xnrm2.h"
-#include "internal/routines/level1/xasum.h"
-#include "internal/routines/level1/xsum.h" // non-BLAS routine
-#include "internal/routines/level1/xamax.h"
-#include "internal/routines/level1/xmax.h" // non-BLAS routine
-#include "internal/routines/level1/xmin.h" // non-BLAS routine
+#include "routines/level1/xswap.hpp"
+#include "routines/level1/xscal.hpp"
+#include "routines/level1/xcopy.hpp"
+#include "routines/level1/xaxpy.hpp"
+#include "routines/level1/xdot.hpp"
+#include "routines/level1/xdotu.hpp"
+#include "routines/level1/xdotc.hpp"
+#include "routines/level1/xnrm2.hpp"
+#include "routines/level1/xasum.hpp"
+#include "routines/level1/xsum.hpp" // non-BLAS routine
+#include "routines/level1/xamax.hpp"
+#include "routines/level1/xmax.hpp" // non-BLAS routine
+#include "routines/level1/xmin.hpp" // non-BLAS routine
// BLAS level-2 includes
-#include "internal/routines/level2/xgemv.h"
-#include "internal/routines/level2/xgbmv.h"
-#include "internal/routines/level2/xhemv.h"
-#include "internal/routines/level2/xhbmv.h"
-#include "internal/routines/level2/xhpmv.h"
-#include "internal/routines/level2/xsymv.h"
-#include "internal/routines/level2/xsbmv.h"
-#include "internal/routines/level2/xspmv.h"
-#include "internal/routines/level2/xtrmv.h"
-#include "internal/routines/level2/xtbmv.h"
-#include "internal/routines/level2/xtpmv.h"
-#include "internal/routines/level2/xger.h"
-#include "internal/routines/level2/xgeru.h"
-#include "internal/routines/level2/xgerc.h"
-#include "internal/routines/level2/xher.h"
-#include "internal/routines/level2/xhpr.h"
-#include "internal/routines/level2/xher2.h"
-#include "internal/routines/level2/xhpr2.h"
-#include "internal/routines/level2/xsyr.h"
-#include "internal/routines/level2/xspr.h"
-#include "internal/routines/level2/xsyr2.h"
-#include "internal/routines/level2/xspr2.h"
+#include "routines/level2/xgemv.hpp"
+#include "routines/level2/xgbmv.hpp"
+#include "routines/level2/xhemv.hpp"
+#include "routines/level2/xhbmv.hpp"
+#include "routines/level2/xhpmv.hpp"
+#include "routines/level2/xsymv.hpp"
+#include "routines/level2/xsbmv.hpp"
+#include "routines/level2/xspmv.hpp"
+#include "routines/level2/xtrmv.hpp"
+#include "routines/level2/xtbmv.hpp"
+#include "routines/level2/xtpmv.hpp"
+#include "routines/level2/xger.hpp"
+#include "routines/level2/xgeru.hpp"
+#include "routines/level2/xgerc.hpp"
+#include "routines/level2/xher.hpp"
+#include "routines/level2/xhpr.hpp"
+#include "routines/level2/xher2.hpp"
+#include "routines/level2/xhpr2.hpp"
+#include "routines/level2/xsyr.hpp"
+#include "routines/level2/xspr.hpp"
+#include "routines/level2/xsyr2.hpp"
+#include "routines/level2/xspr2.hpp"
// BLAS level-3 includes
-#include "internal/routines/level3/xgemm.h"
-#include "internal/routines/level3/xsymm.h"
-#include "internal/routines/level3/xhemm.h"
-#include "internal/routines/level3/xsyrk.h"
-#include "internal/routines/level3/xherk.h"
-#include "internal/routines/level3/xsyr2k.h"
-#include "internal/routines/level3/xher2k.h"
-#include "internal/routines/level3/xtrmm.h"
+#include "routines/level3/xgemm.hpp"
+#include "routines/level3/xsymm.hpp"
+#include "routines/level3/xhemm.hpp"
+#include "routines/level3/xsyrk.hpp"
+#include "routines/level3/xherk.hpp"
+#include "routines/level3/xsyr2k.hpp"
+#include "routines/level3/xher2k.hpp"
+#include "routines/level3/xtrmm.hpp"
// Level-x includes (non-BLAS)
-#include "internal/routines/levelx/xomatcopy.h"
+#include "routines/levelx/xomatcopy.hpp"
namespace clblast {
diff --git a/src/clblast_c.cc b/src/clblast_c.cc
index 22cb2192..9ea2c884 100644
--- a/src/clblast_c.cc
+++ b/src/clblast_c.cc
@@ -15,7 +15,7 @@
#include "clblast_c.h"
#include "clblast.h"
-#include "internal/utilities.h"
+#include "utilities.hpp"
// Shortcuts to the clblast namespace
using float2 = clblast::float2;
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
new file mode 100644
index 00000000..b834d8b4
--- /dev/null
+++ b/src/clpp11.hpp
@@ -0,0 +1,695 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API
+// calls. The main benefits are increased abstraction, automatic memory management, and portability.
+// Portability here means that a similar header exists for CUDA with the same classes and
+// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
+//
+// This file is taken from the Claduc project <https://github.com/CNugteren/Claduc> and therefore
+// contains the following header copyright notice:
+//
+// =================================================================================================
+//
+// Copyright 2015 SURFsara
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CLPP11_H_
+#define CLBLAST_CLPP11_H_
+
+// C++
+#include <algorithm> // std::copy
+#include <string> // std::string
+#include <vector> // std::vector
+#include <memory> // std::shared_ptr
+#include <stdexcept> // std::runtime_error
+#include <numeric> // std::accumulate
+
+// OpenCL
+#if defined(__APPLE__) || defined(__MACOSX)
+ #include <OpenCL/opencl.h>
+#else
+ #include <CL/opencl.h>
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+// Error occurred in the C++11 OpenCL header (this file)
+inline void Error(const std::string &message) {
+ throw std::runtime_error("Internal OpenCL error: "+message);
+}
+
+// Error occurred in OpenCL
+inline void CheckError(const cl_int status) {
+ if (status != CL_SUCCESS) {
+ throw std::runtime_error("Internal OpenCL error: "+std::to_string(status));
+ }
+}
+
+// =================================================================================================
+
+// C++11 version of 'cl_event'. This class only stores the event handle: it contains no calls to
+// retain or release the event.
+class Event {
+ public:
+
+  // Constructor based on the regular OpenCL data-type
+  explicit Event(const cl_event event): event_(event) { }
+
+  // Regular constructor: starts out with a null event handle
+  explicit Event(): event_(nullptr) { }
+
+  // Waits for completion of this event, throws if OpenCL reports an error
+  void WaitForCompletion() const {
+    CheckError(clWaitForEvents(1, &event_));
+  }
+
+  // Retrieves the elapsed time of the last recorded event in milliseconds. Note that no error
+  // checking is done on the 'clGetEventProfilingInfo' function, since there is a bug in Apple's
+  // OpenCL implementation:
+  // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
+  // The time-stamps are stored as 'cl_ulong': OpenCL reports them as 64-bit values, which do not
+  // fit in a 'size_t' on platforms where that type is only 32 bits wide.
+  float GetElapsedTime() const {
+    WaitForCompletion();
+    auto bytes = size_t{0};
+    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
+    auto time_start = cl_ulong{0};
+    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
+    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
+    auto time_end = cl_ulong{0};
+    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
+    return (time_end - time_start) * 1.0e-6f; // from nanoseconds to milliseconds
+  }
+
+  // Accessors to the private data-member: by reference and by pointer
+  cl_event& operator()() { return event_; }
+  cl_event* pointer() { return &event_; }
+ private:
+  cl_event event_;
+};
+
+// Pointer to an OpenCL event
+using EventPointer = cl_event*;
+
+// =================================================================================================
+
+// C++11 version of 'cl_platform_id'
+class Platform {
+ public:
+
+  // Wraps an existing platform given as the regular OpenCL data-type
+  explicit Platform(const cl_platform_id platform): platform_(platform) { }
+
+  // Selects the platform with index 'platform_id' out of all available platforms. Throws if
+  // OpenCL reports an error, if there are no platforms at all, or if the index is out of range.
+  explicit Platform(const size_t platform_id) {
+    cl_uint platform_count = 0;
+    CheckError(clGetPlatformIDs(0, nullptr, &platform_count));
+    if (platform_count == 0) { Error("no platforms found"); }
+    std::vector<cl_platform_id> all_platforms(platform_count);
+    CheckError(clGetPlatformIDs(platform_count, all_platforms.data(), nullptr));
+    if (!(platform_id < platform_count)) {
+      Error("invalid platform ID "+std::to_string(platform_id));
+    }
+    platform_ = all_platforms[platform_id];
+  }
+
+  // Returns the number of devices (of any type) on this platform
+  size_t NumDevices() const {
+    cl_uint device_count = 0;
+    CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &device_count));
+    return static_cast<size_t>(device_count);
+  }
+
+  // Accessor to the private data-member
+  const cl_platform_id& operator()() const { return platform_; }
+ private:
+  cl_platform_id platform_;
+};
+
+// =================================================================================================
+
+// C++11 version of 'cl_device_id'
+class Device {
+ public:
+
+ // Constructor based on the regular OpenCL data-type
+ explicit Device(const cl_device_id device): device_(device) { }
+
+ // Initialize the device. Note that this constructor can throw exceptions!
+ explicit Device(const Platform &platform, const size_t device_id) {
+ auto num_devices = platform.NumDevices();
+ if (num_devices == 0) { Error("no devices found"); }
+ auto devices = std::vector<cl_device_id>(num_devices);
+ CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
+ devices.data(), nullptr));
+ if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
+ device_ = devices[device_id];
+ }
+
+ // Methods to retrieve device information
+ std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); }
+ std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); }
+ std::string Name() const { return GetInfoString(CL_DEVICE_NAME); }
+ std::string Type() const {
+ auto type = GetInfo<cl_device_type>(CL_DEVICE_TYPE);
+ switch(type) {
+ case CL_DEVICE_TYPE_CPU: return "CPU";
+ case CL_DEVICE_TYPE_GPU: return "GPU";
+ case CL_DEVICE_TYPE_ACCELERATOR: return "accelerator";
+ default: return "default";
+ }
+ }
+ size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); }
+ size_t MaxWorkItemDimensions() const {
+ return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
+ }
+ std::vector<size_t> MaxWorkItemSizes() const {
+ return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
+ }
+ size_t LocalMemSize() const {
+ return static_cast<size_t>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
+ }
+ std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
+ size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); }
+ size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); }
+ size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); }
+ size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); }
+ size_t MemoryClock() const { return 0; } // Not exposed in OpenCL
+ size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
+
+ // Configuration-validity checks
+ bool IsLocalMemoryValid(const size_t local_mem_usage) const {
+ return (local_mem_usage <= LocalMemSize());
+ }
+  // Checks whether a work-group with the given local sizes is supported by this device: the
+  // number of dimensions, the size in each dimension, and the total number of work-items are
+  // compared against the limits reported by the device.
+  bool IsThreadConfigValid(const std::vector<size_t> &local) const {
+
+    // Tests the number of dimensions first, before any per-dimension limit is indexed
+    if (local.size() > MaxWorkItemDimensions()) { return false; }
+
+    // Queries the per-dimension limits once rather than once per loop iteration
+    const auto max_sizes = MaxWorkItemSizes();
+    auto local_size = size_t{1};
+    for (auto i=size_t{0}; i<local.size(); ++i) {
+      if (i >= max_sizes.size() || local[i] > max_sizes[i]) { return false; }
+      local_size *= local[i];
+    }
+    return (local_size <= MaxWorkGroupSize());
+  }
+
+ // Query for a specific type of device or brand
+ bool IsCPU() const { return Type() == "CPU"; }
+ bool IsGPU() const { return Type() == "GPU"; }
+ bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
+ bool IsARM() const { return Vendor() == "ARM"; }
+
+ // Accessor to the private data-member
+ const cl_device_id& operator()() const { return device_; }
+ private:
+ cl_device_id device_;
+
+ // Private helper functions
+ template <typename T>
+ T GetInfo(const cl_device_info info) const {
+ auto bytes = size_t{0};
+ CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
+ auto result = T(0);
+ CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
+ return result;
+ }
+  // Retrieves an integer device property and returns it as a 'size_t'. The property is read into
+  // a variable matching the size reported by OpenCL: reading a 64-bit property (this overload is
+  // also used for e.g. CL_DEVICE_GLOBAL_MEM_SIZE) into a 'cl_uint' would write beyond that
+  // variable. Throws if OpenCL reports an error or if the property has an unexpected size.
+  size_t GetInfo(const cl_device_info info) const {
+    auto bytes = size_t{0};
+    CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
+    if (bytes == sizeof(cl_ulong)) { return static_cast<size_t>(GetInfo<cl_ulong>(info)); }
+    if (bytes != sizeof(cl_uint)) { Error("unexpected device info size "+std::to_string(bytes)); }
+    return static_cast<size_t>(GetInfo<cl_uint>(info));
+  }
+ template <typename T>
+ std::vector<T> GetInfoVector(const cl_device_info info) const {
+ auto bytes = size_t{0};
+ CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
+ auto result = std::vector<T>(bytes/sizeof(T));
+ CheckError(clGetDeviceInfo(device_, info, bytes, result.data(), nullptr));
+ return result;
+ }
+ std::string GetInfoString(const cl_device_info info) const {
+ auto bytes = size_t{0};
+ CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
+ auto result = std::string{};
+ result.resize(bytes);
+ CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
+ return std::string{result.c_str()}; // Removes any trailing '\0'-characters
+ }
+};
+
+// =================================================================================================
+
+// C++11 version of 'cl_context'. The handle is stored behind a shared pointer, so copies of an
+// object of this class all refer to the same stored 'cl_context' handle.
+class Context {
+ public:
+
+ // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
+ // (no custom deleter is installed here, so 'clReleaseContext' is not called by this object)
+ explicit Context(const cl_context context):
+ context_(new cl_context) {
+ *context_ = context;
+ }
+
+ // Regular constructor with memory management: creates a context for the single given device
+ // and installs a deleter that releases the context before freeing the stored handle.
+ // NOTE(review): if 'clReleaseContext' fails, 'CheckError' throws from within the deleter, in
+ // which case 'delete c' is skipped and the exception leaves the shared pointer's clean-up. The
+ // same deleter also runs when 'CheckError(status)' below throws - consider a non-throwing check.
+ explicit Context(const Device &device):
+ context_(new cl_context, [](cl_context* c) { CheckError(clReleaseContext(*c)); delete c; }) {
+ auto status = CL_SUCCESS;
+ const cl_device_id dev = device();
+ *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
+ CheckError(status);
+ }
+
+ // Accessors to the private data-member: the handle itself and the address at which it is stored
+ const cl_context& operator()() const { return *context_; }
+ cl_context* pointer() const { return &(*context_); }
+ private:
+ std::shared_ptr<cl_context> context_;
+};
+
+// Pointer to an OpenCL context
+using ContextPointer = cl_context*;
+
+// =================================================================================================
+
+// Enumeration of build statuses of the run-time compilation process
+enum class BuildStatus { kSuccess, kError, kInvalid };
+
+// C++11 version of 'cl_program'. Additionally holds the program's source code.
+class Program {
+ public:
+ // Note that there is no constructor based on the regular OpenCL data-type because of extra state
+
+ // Source-based constructor with memory management
+ explicit Program(const Context &context, std::string source):
+ program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
+ length_(source.length()),
+ source_(std::move(source)),
+ source_ptr_(&source_[0]) {
+ auto status = CL_SUCCESS;
+ *program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
+ CheckError(status);
+ }
+
+ // Binary-based constructor with memory management
+ explicit Program(const Device &device, const Context &context, const std::string& binary):
+ program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
+ length_(binary.length()),
+ source_(binary),
+ source_ptr_(&source_[0]) {
+ auto status1 = CL_SUCCESS;
+ auto status2 = CL_SUCCESS;
+ const cl_device_id dev = device();
+ *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
+ reinterpret_cast<const unsigned char**>(&source_ptr_),
+ &status1, &status2);
+ CheckError(status1);
+ CheckError(status2);
+ }
+
+  // Compiles the device program for the given device and returns the build status: 'kError' upon
+  // a build failure, 'kInvalid' upon an invalid binary, and 'kSuccess' otherwise. Any other
+  // OpenCL error results in an exception. The entries of 'options' are passed on separated by
+  // spaces, such that consecutive options do not run into each other.
+  BuildStatus Build(const Device &device, std::vector<std::string> &options) {
+    auto options_string = std::string{" "};
+    for (const auto &option: options) { options_string += option + " "; }
+    const cl_device_id dev = device();
+    auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr);
+    if (status == CL_BUILD_PROGRAM_FAILURE) {
+      return BuildStatus::kError;
+    }
+    else if (status == CL_INVALID_BINARY) {
+      return BuildStatus::kInvalid;
+    }
+
+    // Anything else than success is reported through an exception
+    CheckError(status);
+    return BuildStatus::kSuccess;
+  }
+
+ // Retrieves the warning/error message from the compiler (if any)
+ std::string GetBuildInfo(const Device &device) const {
+ auto bytes = size_t{0};
+ auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG};
+ CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes));
+ auto result = std::string{};
+ result.resize(bytes);
+ CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr));
+ return result;
+ }
+
+ // Retrieves a binary or an intermediate representation of the compiled program
+ std::string GetIR() const {
+ auto bytes = size_t{0};
+ CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
+ auto result = std::string{};
+ result.resize(bytes);
+ auto result_ptr = result.data();
+ CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
+ return result;
+ }
+
+ // Accessor to the private data-member
+ const cl_program& operator()() const { return *program_; }
+ private:
+ std::shared_ptr<cl_program> program_;
+ size_t length_;
+ std::string source_; // Note: the source can also be a binary or IR
+ const char* source_ptr_;
+};
+
+// =================================================================================================
+
+// C++11 version of 'cl_command_queue'
+class Queue {
+ public:
+
+  // Wraps an already existing OpenCL command-queue. No custom deleter is installed, so the queue
+  // itself is not released by this class: its memory management is handled elsewhere.
+  explicit Queue(const cl_command_queue queue):
+    queue_(new cl_command_queue(queue)) {
+  }
+
+  // Creates a new command-queue with profiling enabled for the given context and device. The
+  // deleter of the shared pointer releases the queue through 'clReleaseCommandQueue'.
+  explicit Queue(const Context &context, const Device &device):
+    queue_(new cl_command_queue, [](cl_command_queue* handle) {
+      CheckError(clReleaseCommandQueue(*handle));
+      delete handle;
+    }) {
+    auto error_code = CL_SUCCESS;
+    #ifdef CL_VERSION_2_0
+      cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
+      *queue_ = clCreateCommandQueueWithProperties(context(), device(), queue_properties,
+                                                   &error_code);
+    #else
+      *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &error_code);
+    #endif
+    CheckError(error_code);
+  }
+
+  // Synchronizes the queue through 'clFinish'. The version taking an event ignores its argument
+  // and simply forwards to the argument-less version.
+  void Finish(Event &) const {
+    Finish();
+  }
+  void Finish() const {
+    CheckError(clFinish(*queue_));
+  }
+
+  // Retrieves the context this queue belongs to: first queries the size of the information, then
+  // the handle itself, which is wrapped in a 'Context' object
+  Context GetContext() const {
+    auto num_bytes = size_t{0};
+    CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, 0, nullptr, &num_bytes));
+    cl_context context;
+    CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, num_bytes, &context, nullptr));
+    return Context(context);
+  }
+
+  // Retrieves the device this queue belongs to, in the same two-step way as 'GetContext'
+  Device GetDevice() const {
+    auto num_bytes = size_t{0};
+    CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, 0, nullptr, &num_bytes));
+    cl_device_id device;
+    CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, num_bytes, &device, nullptr));
+    return Device(device);
+  }
+
+  // Gives access to the underlying OpenCL command-queue handle
+  const cl_command_queue& operator()() const { return *queue_; }
+ private:
+  std::shared_ptr<cl_command_queue> queue_;
+};
+
+// =================================================================================================
+
+// C++11 version of host memory: a shared 'std::vector<T>' with a vector-like interface
+template <typename T>
+class BufferHost {
+ public:
+
+  // Regular constructor with memory management: allocates 'size' value-initialized elements. The
+  // context argument is not used.
+  explicit BufferHost(const Context &, const size_t size):
+    buffer_(new std::vector<T>(size)) {
+  }
+
+  // Retrieves the actual allocated size in bytes
+  size_t GetSize() const {
+    return buffer_->size()*sizeof(T);
+  }
+
+  // Compatibility with std::vector. Note that 'end' points one element past the last element, such
+  // that the range [begin, end) covers all 'size()' elements (and is empty for an empty buffer).
+  size_t size() const { return buffer_->size(); }
+  T* begin() { return buffer_->data(); }
+  T* end() { return buffer_->data() + buffer_->size(); }
+  T& operator[](const size_t i) { return (*buffer_)[i]; }
+  T* data() { return buffer_->data(); }
+  const T* data() const { return buffer_->data(); }
+
+ private:
+  std::shared_ptr<std::vector<T>> buffer_;
+};
+
+// =================================================================================================
+
+// Enumeration of buffer access types (see the 'Buffer' class for their use)
+enum class BufferAccess {
+  kReadOnly,   // created with CL_MEM_READ_ONLY; writing from the host is reported as an error
+  kWriteOnly,  // created with CL_MEM_WRITE_ONLY; reading to the host is reported as an error
+  kReadWrite,  // created with CL_MEM_READ_WRITE
+  kNotOwned    // the OpenCL memory object is not released by the 'Buffer' class
+};
+
+// C++11 version of 'cl_mem'. All sizes and offsets in this interface are given in number of
+// elements of type T (they are multiplied by sizeof(T) internally), unless stated otherwise.
+template <typename T>
+class Buffer {
+ public:
+
+  // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
+  explicit Buffer(const cl_mem buffer):
+    buffer_(new cl_mem),
+    access_(BufferAccess::kNotOwned) {
+    *buffer_ = buffer;
+  }
+
+  // Regular constructor with memory management. If this class does not own the buffer object, then
+  // the memory will not be freed automatically afterwards.
+  explicit Buffer(const Context &context, const BufferAccess access, const size_t size):
+    buffer_(new cl_mem, [access](cl_mem* m) {
+      if (access != BufferAccess::kNotOwned) { CheckError(clReleaseMemObject(*m)); }
+      delete m;
+    }),
+    access_(access) {
+    auto flags = cl_mem_flags{CL_MEM_READ_WRITE};
+    if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; }
+    if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; }
+    auto status = CL_SUCCESS;
+    *buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status);
+    CheckError(status);
+  }
+
+  // As above, but now with read/write access as a default
+  explicit Buffer(const Context &context, const size_t size):
+    Buffer<T>(context, BufferAccess::kReadWrite, size) {
+  }
+
+  // Constructs a new buffer based on an existing host-container: allocates room for the elements
+  // in [start, end) and copies them to the device, waiting for the copy to complete
+  template <typename Iterator>
+  explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
+    Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
+    auto size = static_cast<size_t>(end - start);
+    auto pointer = &*start;
+    CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer, 0,
+                                    nullptr, nullptr));
+    queue.Finish();
+  }
+
+  // Copies from device to host: reading the device buffer a-synchronously. The container-based
+  // versions report an error in case the host container holds fewer than 'size' elements.
+  void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
+    if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
+    CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
+                                   host, 0, nullptr, nullptr));
+  }
+  void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
+                 const size_t offset = 0) const {
+    if (host.size() < size) { Error("target host buffer is too small"); }
+    ReadAsync(queue, size, host.data(), offset);
+  }
+  void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
+                 const size_t offset = 0) const {
+    if (host.size() < size) { Error("target host buffer is too small"); }
+    ReadAsync(queue, size, host.data(), offset);
+  }
+
+  // Copies from device to host: reading the device buffer and waiting for completion. The
+  // container-based versions perform the same host-size check as their a-synchronous counterparts.
+  void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
+    ReadAsync(queue, size, host, offset);
+    queue.Finish();
+  }
+  void Read(const Queue &queue, const size_t size, std::vector<T> &host,
+            const size_t offset = 0) const {
+    if (host.size() < size) { Error("target host buffer is too small"); }
+    Read(queue, size, host.data(), offset);
+  }
+  void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
+            const size_t offset = 0) const {
+    if (host.size() < size) { Error("target host buffer is too small"); }
+    Read(queue, size, host.data(), offset);
+  }
+
+  // Copies from host to device: writing the device buffer a-synchronously. Reports an error when
+  // the device buffer cannot hold 'offset+size' elements; the container-based versions additionally
+  // report an error in case the host container holds fewer than 'size' elements.
+  void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
+    if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
+    if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
+    CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
+                                    host, 0, nullptr, nullptr));
+  }
+  void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
+                  const size_t offset = 0) {
+    if (host.size() < size) { Error("source host buffer is too small"); }
+    WriteAsync(queue, size, host.data(), offset);
+  }
+  void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
+                  const size_t offset = 0) {
+    if (host.size() < size) { Error("source host buffer is too small"); }
+    WriteAsync(queue, size, host.data(), offset);
+  }
+
+  // Copies from host to device: writing the device buffer and waiting for completion. The
+  // container-based versions check the host size before forwarding to the pointer-based version.
+  void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
+    WriteAsync(queue, size, host, offset);
+    queue.Finish();
+  }
+  void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
+             const size_t offset = 0) {
+    if (host.size() < size) { Error("source host buffer is too small"); }
+    Write(queue, size, host.data(), offset);
+  }
+  void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
+             const size_t offset = 0) {
+    if (host.size() < size) { Error("source host buffer is too small"); }
+    Write(queue, size, host.data(), offset);
+  }
+
+  // Copies the first 'size' elements of this buffer to the start of another device buffer
+  void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
+    CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0,
+                                   nullptr, nullptr));
+  }
+  void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
+    CopyToAsync(queue, size, destination);
+    queue.Finish();
+  }
+
+  // Retrieves the actual allocated size in bytes
+  size_t GetSize() const {
+    auto bytes = size_t{0};
+    CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes));
+    auto result = size_t{0};
+    CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
+    return result;
+  }
+
+  // Accessor to the private data-member
+  const cl_mem& operator()() const { return *buffer_; }
+ private:
+  std::shared_ptr<cl_mem> buffer_;
+  const BufferAccess access_;
+};
+
+// =================================================================================================
+
+// C++11 version of 'cl_kernel'
+class Kernel {
+ public:
+
+  // Wraps an already existing OpenCL kernel. No custom deleter is installed, so the kernel itself
+  // is not released by this class: its memory management is handled elsewhere.
+  explicit Kernel(const cl_kernel kernel):
+    kernel_(new cl_kernel(kernel)) {
+  }
+
+  // Creates the kernel with the given name from a program. The deleter of the shared pointer
+  // releases the kernel through 'clReleaseKernel'.
+  explicit Kernel(const Program &program, const std::string &name):
+    kernel_(new cl_kernel, [](cl_kernel* handle) {
+      CheckError(clReleaseKernel(*handle));
+      delete handle;
+    }) {
+    auto error_code = CL_SUCCESS;
+    *kernel_ = clCreateKernel(program(), name.c_str(), &error_code);
+    CheckError(error_code);
+  }
+
+  // Sets a kernel argument at the indicated position: the value is passed by address together
+  // with its size. For a 'Buffer', the underlying OpenCL memory object is passed instead.
+  template <typename T>
+  void SetArgument(const size_t index, const T &value) {
+    const auto position = static_cast<cl_uint>(index);
+    CheckError(clSetKernelArg(*kernel_, position, sizeof(T), &value));
+  }
+  template <typename T>
+  void SetArgument(const size_t index, Buffer<T> &value) {
+    SetArgument(index, value());
+  }
+
+  // Sets all arguments in one go using parameter packs: the arguments are assigned to consecutive
+  // positions starting at zero. Note that this overwrites previously set arguments using
+  // 'SetArgument' or 'SetArguments'.
+  template <typename... Args>
+  void SetArguments(Args&... args) {
+    SetArgumentsRecursive(0, args...);
+  }
+
+  // Retrieves the amount of local memory used per work-group for this kernel. The information is
+  // queried in two steps: first its size in bytes, then the value itself.
+  size_t LocalMemUsage(const Device &device) const {
+    const auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
+    auto num_bytes = size_t{0};
+    CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &num_bytes));
+    auto local_memory = size_t{0};
+    CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, num_bytes, &local_memory,
+                                        nullptr));
+    return local_memory;
+  }
+
+  // Launches a kernel onto the specified queue. The number of dimensions is taken from the size
+  // of the 'global' vector.
+  void Launch(const Queue &queue, const std::vector<size_t> &global,
+              const std::vector<size_t> &local, EventPointer event) {
+    const auto dimensions = static_cast<cl_uint>(global.size());
+    CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, dimensions, nullptr,
+                                      global.data(), local.data(),
+                                      0, nullptr, event));
+  }
+
+  // As above, but with an event waiting list. An empty list falls back to the version above.
+  void Launch(const Queue &queue, const std::vector<size_t> &global,
+              const std::vector<size_t> &local, EventPointer event,
+              std::vector<Event>& waitForEvents) {
+    if (waitForEvents.empty()) {
+      Launch(queue, global, local, event);
+      return;
+    }
+
+    // Collects the plain OpenCL events, as required by the OpenCL API
+    auto plain_events = std::vector<cl_event>();
+    for (auto &wait_event : waitForEvents) {
+      plain_events.push_back(wait_event());
+    }
+
+    // Launches the kernel while waiting for other events
+    const auto dimensions = static_cast<cl_uint>(global.size());
+    const auto num_events = static_cast<cl_uint>(plain_events.size());
+    CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, dimensions, nullptr,
+                                      global.data(), local.data(),
+                                      num_events, plain_events.data(), event));
+  }
+
+  // As the first version, but with the default local workgroup size (no local size is passed)
+  void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
+    const auto dimensions = static_cast<cl_uint>(global.size());
+    CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, dimensions, nullptr,
+                                      global.data(), nullptr,
+                                      0, nullptr, event));
+  }
+
+  // Gives access to the underlying OpenCL kernel handle
+  const cl_kernel& operator()() const { return *kernel_; }
+ private:
+  std::shared_ptr<cl_kernel> kernel_;
+
+  // Internal implementation for the recursive SetArguments function: sets the argument at 'index'
+  // and continues with the remaining arguments at the next position. The single-argument version
+  // ends the recursion.
+  template <typename T>
+  void SetArgumentsRecursive(const size_t index, T &first) {
+    SetArgument(index, first);
+  }
+  template <typename T, typename... Args>
+  void SetArgumentsRecursive(const size_t index, T &first, Args&... args) {
+    SetArgument(index, first);
+    SetArgumentsRecursive(index+1, args...);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_CLPP11_H_
+#endif
diff --git a/src/database.cc b/src/database/database.cc
index e20ae340..6ec93731 100644
--- a/src/database.cc
+++ b/src/database/database.cc
@@ -11,18 +11,18 @@
//
// =================================================================================================
-#include "internal/database.h"
-#include "internal/database/xaxpy.h"
-#include "internal/database/xdot.h"
-#include "internal/database/xgemv.h"
-#include "internal/database/xger.h"
-#include "internal/database/xgemm.h"
-#include "internal/database/copy.h"
-#include "internal/database/pad.h"
-#include "internal/database/transpose.h"
-#include "internal/database/padtranspose.h"
-
-#include "internal/utilities.h"
+#include "utilities.hpp"
+
+#include "database/database.hpp"
+#include "database/kernels/xaxpy.hpp"
+#include "database/kernels/xdot.hpp"
+#include "database/kernels/xgemv.hpp"
+#include "database/kernels/xger.hpp"
+#include "database/kernels/xgemm.hpp"
+#include "database/kernels/copy.hpp"
+#include "database/kernels/pad.hpp"
+#include "database/kernels/transpose.hpp"
+#include "database/kernels/padtranspose.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/database/database.hpp b/src/database/database.hpp
new file mode 100644
index 00000000..0987cbed
--- /dev/null
+++ b/src/database/database.hpp
@@ -0,0 +1,104 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Database class, providing a static variable holding the actual database
+// information. The class also provides utility functions to search the database and to access a
+// found entry by parameter-key. The database itself is filled in the corresponding source-file and
+// partially also by the database/kernels/xxxxx.hpp files, which hold kernel-specific parameters.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_DATABASE_H_
+#define CLBLAST_DATABASE_H_
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include "utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+class Database {
+ public:
+
+  // Type alias for the database parameters: maps a parameter name onto its value
+  using Parameters = std::unordered_map<std::string,size_t>;
+
+  // Structures for content inside the database: an entry holds the data for one kernel and one
+  // precision, grouped per vendor (and device type), and within a vendor per device name
+  struct DatabaseDevice {
+    const std::string name;
+    const Parameters parameters;
+  };
+  struct DatabaseVendor {
+    const std::string type;
+    const std::string name;
+    const std::vector<DatabaseDevice> devices;
+  };
+  struct DatabaseEntry {
+    const std::string kernel;
+    const Precision precision;
+    const std::vector<DatabaseVendor> vendors;
+  };
+
+  // The OpenCL device types
+  static constexpr auto kDeviceTypeCPU = "CPU";
+  static constexpr auto kDeviceTypeGPU = "GPU";
+  static constexpr auto kDeviceTypeAccelerator = "accelerator";
+  static constexpr auto kDeviceTypeAll = "default";
+
+  // The OpenCL device vendors
+  static constexpr auto kDeviceVendorAll = "default";
+
+  // Alternative names for some OpenCL vendors
+  const std::unordered_map<std::string,std::string> kVendorNames {
+    {"Intel(R) Corporation", "Intel"},
+    {"GenuineIntel", "Intel"},
+    {"Advanced Micro Devices, Inc.", "AMD"},
+    {"NVIDIA Corporation", "NVIDIA"},
+  };
+
+  // The database consists of separate database entries, stored together in a vector
+  static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
+  static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
+  static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
+  static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
+  static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
+  static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
+  static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
+  static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
+  static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
+  static const std::vector<DatabaseEntry> database;
+
+  // The constructor
+  explicit Database(const Queue &queue, const std::vector<std::string> &routines,
+                    const Precision precision);
+
+  // Accessor of values by key. Throws std::out_of_range in case the key is not among the found
+  // parameters, rather than dereferencing the past-the-end iterator of the map.
+  size_t operator[](const std::string &key) const { return parameters_.at(key); }
+
+  // Obtain a list of OpenCL pre-processor defines based on the parameters
+  std::string GetDefines() const;
+
+ private:
+  Parameters Search(const std::string &this_kernel, const std::string &this_type,
+                    const std::string &this_vendor, const std::string &this_device,
+                    const Precision this_precision) const;
+
+  // Found parameters suitable for this device/kernel
+  Parameters parameters_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_DATABASE_H_
+#endif
diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp
new file mode 100644
index 00000000..201e8b8a
--- /dev/null
+++ b/src/database/kernels/copy.hpp
@@ -0,0 +1,262 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Copy' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+// Tuning parameters for the 'Copy' kernels in half precision. Layout: the kernel name, the
+// precision, and then one group per device type and vendor. Each group lists device names with
+// their parameter values and ends with an entry named "default".
+const Database::DatabaseEntry Database::CopyHalf = {
+ "Copy", Precision::kHalf, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+// Tuning parameters for the 'Copy' kernels in single precision. Layout: the kernel name, the
+// precision, and then one group per device type and vendor. Each group lists device names with
+// their parameter values and ends with an entry named "default".
+const Database::DatabaseEntry Database::CopySingle = {
+ "Copy", Precision::kSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+ { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
+ { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+ { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
+ { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ { "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
+ { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
+ { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+// Tuning parameters for the 'Copy' kernels in single-precision complex. Layout: the kernel name,
+// the precision, and then one group per device type and vendor. Each group lists device names
+// with their parameter values and ends with an entry named "default".
+const Database::DatabaseEntry Database::CopyComplexSingle = {
+ "Copy", Precision::kComplexSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+ { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+ { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
+ { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+ { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
+ { "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+// Tuning parameters for the 'Copy' kernels in double precision. Layout: the kernel name, the
+// precision, and then one group per device type and vendor. Each group lists device names with
+// their parameter values and ends with an entry named "default".
+const Database::DatabaseEntry Database::CopyDouble = {
+ "Copy", Precision::kDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+ { "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } },
+ { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+ { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+// Tuning parameters for the 'Copy' kernels in double-precision complex. Layout: the kernel name,
+// the precision, and then one group per device type and vendor. Each group lists device names
+// with their parameter values and ends with an entry named "default".
+const Database::DatabaseEntry Database::CopyComplexDouble = {
+ "Copy", Precision::kComplexDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
+ { "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+ { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+ { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp
new file mode 100644
index 00000000..cc703dd6
--- /dev/null
+++ b/src/database/kernels/pad.hpp
@@ -0,0 +1,270 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Pad' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadHalf = {
+ "Pad", Precision::kHalf, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadSingle = {
+ "Pad", Precision::kSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+ { "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+ { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadComplexSingle = {
+ "Pad", Precision::kComplexSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
+ { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
+ { "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadDouble = {
+ "Pad", Precision::kDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadComplexDouble = {
+ "Pad", Precision::kComplexDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp
new file mode 100644
index 00000000..f3b1f262
--- /dev/null
+++ b/src/database/kernels/padtranspose.hpp
@@ -0,0 +1,270 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Padtranspose' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadtransposeHalf = {
+ "Padtranspose", Precision::kHalf, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadtransposeSingle = {
+ "Padtranspose", Precision::kSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
+ { "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
+ { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
+ { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
+ { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
+ { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
+ { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
+ "Padtranspose", Precision::kComplexSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
+ { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
+ { "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
+ { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadtransposeDouble = {
+ "Padtranspose", Precision::kDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
+ { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ { "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
+ { "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
+ { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
+ "Padtranspose", Precision::kComplexDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+ { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
+ { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp
new file mode 100644
index 00000000..0c893dae
--- /dev/null
+++ b/src/database/kernels/transpose.hpp
@@ -0,0 +1,258 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Transpose' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TransposeHalf = {
+ "Transpose", Precision::kHalf, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TransposeSingle = {
+ "Transpose", Precision::kSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
+ { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
+ { "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
+ { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TransposeComplexSingle = {
+ "Transpose", Precision::kComplexSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
+ { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TransposeDouble = {
+ "Transpose", Precision::kDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
+ { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "default", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TransposeComplexDouble = {
+ "Transpose", Precision::kComplexDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+ { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX 980", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp
new file mode 100644
index 00000000..6e6719e8
--- /dev/null
+++ b/src/database/kernels/xaxpy.hpp
@@ -0,0 +1,270 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Xaxpy' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpyHalf = {
+ "Xaxpy", Precision::kHalf, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
+ { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpySingle = {
+ "Xaxpy", Precision::kSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
+ { "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
+ { "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
+ { "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } },
+ { "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",1024}, {"WPT",2} } },
+ { "default", { {"VW",2}, {"WGS",1024}, {"WPT",2} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
+ { "GeForce GTX 480", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
+ { "GeForce GTX 680", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+ { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+ { "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
+ { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
+ { "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpyComplexSingle = {
+ "Xaxpy", Precision::kComplexSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } },
+ { "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
+ { "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
+ { "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
+ { "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
+ { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
+ { "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+ { "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
+ { "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+ { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
+ { "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ { "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpyDouble = {
+ "Xaxpy", Precision::kDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+ { "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
+ { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"VW",2}, {"WGS",128}, {"WPT",2} } },
+ { "default", { {"VW",2}, {"WGS",128}, {"WPT",2} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
+ { "default", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "GeForce GTX 480", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
+ { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+ { "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
+ { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
+ { "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
+ { "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpyComplexDouble = {
+ "Xaxpy", Precision::kComplexDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
+ { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ { "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+ { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
+ { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+ { "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
+ { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+ { "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp
new file mode 100644
index 00000000..d09d8c62
--- /dev/null
+++ b/src/database/kernels/xdot.hpp
@@ -0,0 +1,200 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Xdot' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XdotHalf = {
+ "Xdot", Precision::kHalf, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
+ { "default", { {"WGS1",32}, {"WGS2",32} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",32}, {"WGS2",32} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XdotSingle = {
+ "Xdot", Precision::kSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } },
+ { "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
+ { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
+ { "Tahiti", { {"WGS1",128}, {"WGS2",32} } },
+ { "default", { {"WGS1",128}, {"WGS2",32} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
+ { "default", { {"WGS1",1024}, {"WGS2",32} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } },
+ { "Iris Pro", { {"WGS1",512}, {"WGS2",64} } },
+ { "default", { {"WGS1",64}, {"WGS2",32} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"WGS1",128}, {"WGS2",32} } },
+ { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
+ { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } },
+ { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
+ { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
+ { "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
+ { "default", { {"WGS1",128}, {"WGS2",32} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",64}, {"WGS2",32} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XdotComplexSingle = {
+ "Xdot", Precision::kComplexSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
+ { "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
+ { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
+ { "Tahiti", { {"WGS1",64}, {"WGS2",32} } },
+ { "default", { {"WGS1",64}, {"WGS2",32} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
+ { "default", { {"WGS1",1024}, {"WGS2",32} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
+ { "Iris Pro", { {"WGS1",32}, {"WGS2",32} } },
+ { "default", { {"WGS1",32}, {"WGS2",32} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"WGS1",64}, {"WGS2",32} } },
+ { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
+ { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
+ { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
+ { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
+ { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
+ { "default", { {"WGS1",64}, {"WGS2",32} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",32}, {"WGS2",32} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XdotDouble = {
+ "Xdot", Precision::kDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } },
+ { "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
+ { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
+ { "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
+ { "default", { {"WGS1",64}, {"WGS2",32} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",64} } },
+ { "default", { {"WGS1",512}, {"WGS2",64} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"WGS1",128}, {"WGS2",32} } },
+ { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
+ { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
+ { "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
+ { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
+ { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
+ { "default", { {"WGS1",128}, {"WGS2",32} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",64}, {"WGS2",32} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XdotComplexDouble = {
+ "Xdot", Precision::kComplexDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
+ { "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
+ { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
+ { "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
+ { "default", { {"WGS1",64}, {"WGS2",32} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
+ { "default", { {"WGS1",1024}, {"WGS2",32} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"WGS1",64}, {"WGS2",32} } },
+ { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
+ { "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } },
+ { "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
+ { "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
+ { "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
+ { "default", { {"WGS1",64}, {"WGS2",32} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",64}, {"WGS2",32} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp
new file mode 100644
index 00000000..f35d2c88
--- /dev/null
+++ b/src/database/kernels/xgemm.hpp
@@ -0,0 +1,263 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Xgemm' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmHalf = {
+ "Xgemm", Precision::kHalf, {
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmSingle = {
+ "Xgemm", Precision::kSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
+ { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
+ { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
+ { "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
+ { "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
+ { "default", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
+ { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
+ { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
+ { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
+ { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
+ { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
+ { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
+ { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
+ { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmComplexSingle = {
+ "Xgemm", Precision::kComplexSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
+ { "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
+ { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
+ { "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
+ { "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
+ { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
+ { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
+ { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDouble = {
+ "Xgemm", Precision::kDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
+ { "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
+ { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
+ { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+ { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
+ { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
+ { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmComplexDouble = {
+ "Xgemm", Precision::kComplexDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
+ { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } },
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
+ { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
+ { "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp
new file mode 100644
index 00000000..6b76c8ac
--- /dev/null
+++ b/src/database/kernels/xgemv.hpp
@@ -0,0 +1,231 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Xgemv' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemvHalf = {
+ "Xgemv", Precision::kHalf, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemvSingle = {
+ "Xgemv", Precision::kSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",4} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
+ { "Iris", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } },
+ { "Iris Pro", { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
+ { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
+ { "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } },
+ { "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
+ { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
+ { "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
+ { "Tesla K20m", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
+ { "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemvComplexSingle = {
+ "Xgemv", Precision::kComplexSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
+ { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "Pitcairn", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
+ { "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
+ { "Iris", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "Iris Pro", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+ { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+// Tuning-parameter table for the "Xgemv" kernels in double precision (Precision::kDouble). The
+// table is grouped by device type and vendor; every group lists one parameter set per device name
+// and closes with a "default" entry. The last group (kDeviceTypeAll, "default") is vendor-neutral.
+// NOTE(review): WGSn/WPTn/VWn presumably are the work-group size, work-per-thread and vector width
+// of the n-th Xgemv kernel variant -- confirm against the kernel sources.
+const Database::DatabaseEntry Database::XgemvDouble = {
+ "Xgemv", Precision::kDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
+ { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
+ { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",2} } },
+ { "default", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
+ { "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
+ { "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
+ { "Tesla K20m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+// Tuning-parameter table for the "Xgemv" kernels in complex double precision
+// (Precision::kComplexDouble). The table is grouped by device type and vendor; every group lists
+// one parameter set per device name and closes with a "default" entry. The last group
+// (kDeviceTypeAll, "default") is vendor-neutral.
+// NOTE(review): WGSn/WPTn/VWn presumably are the work-group size, work-per-thread and vector width
+// of the n-th Xgemv kernel variant -- confirm against the kernel sources.
+const Database::DatabaseEntry Database::XgemvComplexDouble = {
+ "Xgemv", Precision::kComplexDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
+ { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
+ { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
+ }
+ },
+ { // Intel accelerators
+ kDeviceTypeAccelerator, "Intel", {
+ { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
+ { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp
new file mode 100644
index 00000000..f2e0a36f
--- /dev/null
+++ b/src/database/kernels/xger.hpp
@@ -0,0 +1,220 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Xger' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+// Best-found tuning parameters for the "Xger" kernels in half precision (Precision::kHalf). Only
+// an Intel GPU group is present besides the vendor-neutral (kDeviceTypeAll, "default") group; each
+// group closes with a "default" entry.
+// NOTE(review): WGS1/WGS2/WPT presumably are the two work-group dimensions and the
+// work-per-thread of the Xger kernel -- confirm against the kernel source.
+const Database::DatabaseEntry Database::XgerHalf = {
+ "Xger", Precision::kHalf, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+ { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+// Best-found tuning parameters for the "Xger" kernels in single precision (Precision::kSingle).
+// The table is grouped by device type and vendor; every group lists one parameter set per device
+// name and closes with a "default" entry. The last group (kDeviceTypeAll, "default") is
+// vendor-neutral.
+// NOTE(review): WGS1/WGS2/WPT presumably are the two work-group dimensions and the
+// work-per-thread of the Xger kernel -- confirm against the kernel source.
+const Database::DatabaseEntry Database::XgerSingle = {
+ "Xger", Precision::kSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
+ { "Hawaii", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
+ { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+ { "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
+ { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } },
+ { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
+ { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } },
+ { "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
+ { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",4} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
+ { "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
+ { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
+ { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
+ { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+// Best-found tuning parameters for the "Xger" kernels in complex single precision
+// (Precision::kComplexSingle). The table is grouped by device type and vendor; every group lists
+// one parameter set per device name and closes with a "default" entry. The last group
+// (kDeviceTypeAll, "default") is vendor-neutral.
+// NOTE(review): WGS1/WGS2/WPT presumably are the two work-group dimensions and the
+// work-per-thread of the Xger kernel -- confirm against the kernel source.
+const Database::DatabaseEntry Database::XgerComplexSingle = {
+ "Xger", Precision::kComplexSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
+ { "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
+ { "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
+ { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
+ { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
+ { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
+ { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
+ { "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
+ { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
+ { "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
+ { "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
+ { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
+ { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+// Best-found tuning parameters for the "Xger" kernels in double precision (Precision::kDouble).
+// The table is grouped by device type and vendor; every group lists one parameter set per device
+// name and closes with a "default" entry. The last group (kDeviceTypeAll, "default") is
+// vendor-neutral.
+// NOTE(review): WGS1/WGS2/WPT presumably are the two work-group dimensions and the
+// work-per-thread of the Xger kernel -- confirm against the kernel source.
+const Database::DatabaseEntry Database::XgerDouble = {
+ "Xger", Precision::kDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
+ { "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
+ { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+ { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
+ { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
+ { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
+ { "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",1} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } },
+ { "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
+ { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
+ { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
+ { "default", { {"WGS1",16}, {"WGS2",4}, {"WPT",2} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+// Best-found tuning parameters for the "Xger" kernels in complex double precision
+// (Precision::kComplexDouble). The table is grouped by device type and vendor; every group lists
+// one parameter set per device name and closes with a "default" entry. The last group
+// (kDeviceTypeAll, "default") is vendor-neutral.
+// NOTE(review): WGS1/WGS2/WPT presumably are the two work-group dimensions and the
+// work-per-thread of the Xger kernel -- confirm against the kernel source.
+const Database::DatabaseEntry Database::XgerComplexDouble = {
+ "Xger", Precision::kComplexDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+ { "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
+ { "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
+ { "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
+ { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
+ }
+ },
+ { // ARM GPUs
+ kDeviceTypeGPU, "ARM", {
+ { "Mali-T628", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } },
+ { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } },
+ }
+ },
+ { // Intel CPUs
+ kDeviceTypeCPU, "Intel", {
+ { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
+ { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
+ { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GRID K520", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
+ { "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
+ { "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
+ { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
+ { "default", { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/public_api.hpp b/src/public_api.hpp
new file mode 100644
index 00000000..d0732297
--- /dev/null
+++ b/src/public_api.hpp
@@ -0,0 +1,34 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides macros to define the public API. This is needed when building a Windows DLL.
+// Note: this is only used for the C++ interface, the C interface has its own definition included in
+// the header file itself.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_PUBLIC_API_H_
+#define CLBLAST_PUBLIC_API_H_
+
+namespace clblast {
+// =================================================================================================
+
+// PUBLIC_API marks the library functions that are exported. When _WIN32 is defined it expands to
+// __declspec(dllexport), which is needed when building a DLL; otherwise it expands to nothing.
+// See also: https://msdn.microsoft.com/en-us/library/a90k134d.aspx
+#if !defined(_WIN32)
+  #define PUBLIC_API
+#else
+  #define PUBLIC_API __declspec(dllexport)
+#endif
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_PUBLIC_API_H_
+#endif
diff --git a/src/routine.cc b/src/routine.cc
index 11633ede..d3590896 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -14,7 +14,7 @@
#include <string>
#include <vector>
-#include "internal/routine.h"
+#include "routine.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/routine.hpp b/src/routine.hpp
new file mode 100644
index 00000000..54b5779f
--- /dev/null
+++ b/src/routine.hpp
@@ -0,0 +1,68 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements all the basic functionality for the BLAS routines. This class serves as a
+// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as
+// compiling the OpenCL kernel, connecting to the database, etc.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINE_H_
+#define CLBLAST_ROUTINE_H_
+
+#include <string>
+#include <vector>
+
+#include "utilities.hpp"
+#include "cache.hpp"
+#include "buffer_test.hpp"
+#include "database/database.hpp"
+#include "routines/common.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Base class for the actual BLAS routines (e.g. Xaxpy, Xgemm). It stores the state shared by all
+// routines: the precision, the routine name and kernel source, the OpenCL objects, the device
+// name and the database with device-specific parameters. See also the comment at the top of the
+// file.
+class Routine {
+ public:
+
+ // Base class constructor. Takes the OpenCL queue and event to work with, the name of the
+ // routine, a list of strings ('routines') and the precision.
+ // NOTE(review): 'routines' presumably names the kernels/database entries this routine needs --
+ // confirm in routine.cc.
+ explicit Routine(Queue &queue, EventPointer event, const std::string &name,
+ const std::vector<std::string> &routines, const Precision precision);
+
+ // Set-up phase of the kernel; reports the result as a StatusCode
+ StatusCode SetUp();
+
+ protected:
+
+ // Non-static variable for the precision
+ const Precision precision_;
+
+ // The routine's name and its kernel-source in string form. The source string is the only
+ // non-const string member, so it can still be modified after construction.
+ const std::string routine_name_;
+ std::string source_string_;
+
+ // The OpenCL objects, accessible only from derived classes
+ Queue queue_;
+ EventPointer event_;
+ const Context context_;
+ const Device device_;
+
+ // OpenCL device properties
+ const std::string device_name_;
+
+ // Connection to the database for all the device-specific parameters. Members are initialized in
+ // declaration order, so db_ is constructed after all members above it.
+ const Database db_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINE_H_
+#endif
diff --git a/src/routines/common.cc b/src/routines/common.cc
index 561a1bd8..c378df28 100644
--- a/src/routines/common.cc
+++ b/src/routines/common.cc
@@ -13,7 +13,7 @@
#include <vector>
-#include "internal/routines/common.h"
+#include "routines/common.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
new file mode 100644
index 00000000..c99cd39d
--- /dev/null
+++ b/src/routines/common.hpp
@@ -0,0 +1,173 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains all the interfaces to common kernels, such as copying, padding, and
+// transposing a matrix. These functions are templated and thus header-only. This file also contains
+// other common functions to routines, such as a function to launch a kernel.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_COMMON_H_
+#define CLBLAST_ROUTINES_COMMON_H_
+
+#include <string>
+#include <vector>
+
+#include "clblast.h"
+#include "clpp11.hpp"
+#include "database/database.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Enqueues a kernel, waits for completion, and checks for errors. The global thread sizes are
+// passed by value whereas the local sizes are passed by const-reference; 'waitForEvents' is the
+// list of events the launch is associated with and 'event' the event pointer handed to the launch.
+// NOTE(review): 'global' is presumably taken by value so that the implementation can adjust it --
+// confirm in common.cc.
+StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+ std::vector<size_t> global, const std::vector<size_t> &local,
+ EventPointer event, std::vector<Event>& waitForEvents);
+
+// As above, but without an event waiting list
+StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+ std::vector<size_t> global, const std::vector<size_t> &local,
+ EventPointer event);
+
+// =================================================================================================
+
+// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
+// to write to symmetric and triangular matrices through optional arguments.
+//
+// Kernel selection:
+//  - A 'fast' kernel (TransposeMatrixFast or CopyMatrixFast) is only used when both offsets are
+//    zero, source and destination have identical sizes and leading dimensions, no conjugation and
+//    no upper/lower/diagonal handling is requested, and the sizes pass the multiplicity checks
+//    below. Those checks match the launch configuration: the global sizes are the matrix sizes
+//    divided by the per-thread work, and they have to be multiples of the local sizes.
+//  - Otherwise the general kernel is used: the 'Pad' variant when do_pad is set, else the plain
+//    variant.
+//
+// Arguments:
+//  - db: database holding the TRA_*, PADTRA_*, COPY_* and PAD_* parameters used below
+//  - event, waitForEvents: forwarded unchanged to RunKernel
+//  - src_one, src_two, src_ld, src_offset, src: sizes, leading dimension, offset and buffer of
+//    the source matrix; the dest_* arguments describe the destination matrix likewise
+//  - alpha: scalar that is uploaded to a one-element device buffer and passed to the kernel
+//  - program: the compiled program from which the kernel is retrieved
+//  - do_conjugate: only passed to the kernel when do_pad is set
+//  - upper, lower, diagonal_imag_zero: only passed to the kernel when do_pad is not set
+//
+// Returns the status code of RunKernel, or StatusCode::kInvalidKernel when retrieving the kernel,
+// setting its arguments or launching it throws. Exceptions thrown while creating or writing the
+// alpha buffer are not caught here.
+template <typename T>
+StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Context &context,
+                                  const Database &db,
+                                  EventPointer event, std::vector<Event>& waitForEvents,
+                                  const size_t src_one, const size_t src_two,
+                                  const size_t src_ld, const size_t src_offset,
+                                  const Buffer<T> &src,
+                                  const size_t dest_one, const size_t dest_two,
+                                  const size_t dest_ld, const size_t dest_offset,
+                                  const Buffer<T> &dest,
+                                  const T alpha,
+                                  const Program &program, const bool do_pad,
+                                  const bool do_transpose, const bool do_conjugate,
+                                  const bool upper = false, const bool lower = false,
+                                  const bool diagonal_imag_zero = false) {
+
+  // Determines whether or not the fast-version could potentially be used
+  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
+                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
+                         (upper == false) && (lower == false) && (diagonal_imag_zero == false);
+
+  // Determines the right kernel
+  auto kernel_name = std::string{};
+  if (do_transpose) {
+    // The fast transpose is launched with global sizes of dest/TRA_WPT and local sizes of TRA_DIM
+    // (see below), so both dimensions have to be multiples of TRA_WPT*TRA_DIM. This mirrors the
+    // COPY_VW*COPY_DIMX and COPY_WPT*COPY_DIMY requirements of the fast copy.
+    if (use_fast_kernel &&
+        IsMultiple(src_ld, db["TRA_WPT"]) &&
+        IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) &&
+        IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) {
+      kernel_name = "TransposeMatrixFast";
+    }
+    else {
+      use_fast_kernel = false;
+      kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
+    }
+  }
+  else {
+    if (use_fast_kernel &&
+        IsMultiple(src_ld, db["COPY_VW"]) &&
+        IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) &&
+        IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) {
+      kernel_name = "CopyMatrixFast";
+    }
+    else {
+      use_fast_kernel = false;
+      kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
+    }
+  }
+
+  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+  auto alpha_buffer = Buffer<T>(context, 1);
+  alpha_buffer.Write(queue, 1, &alpha);
+
+  // Retrieves the kernel from the compiled binary
+  try {
+    auto kernel = Kernel(program, kernel_name);
+
+    // Sets the kernel arguments. The fast kernels only take the leading dimension, the two
+    // buffers and alpha; the general kernels take the full description of both matrices.
+    if (use_fast_kernel) {
+      kernel.SetArgument(0, static_cast<int>(src_ld));
+      kernel.SetArgument(1, src());
+      kernel.SetArgument(2, dest());
+      kernel.SetArgument(3, alpha_buffer());
+    }
+    else {
+      kernel.SetArgument(0, static_cast<int>(src_one));
+      kernel.SetArgument(1, static_cast<int>(src_two));
+      kernel.SetArgument(2, static_cast<int>(src_ld));
+      kernel.SetArgument(3, static_cast<int>(src_offset));
+      kernel.SetArgument(4, src());
+      kernel.SetArgument(5, static_cast<int>(dest_one));
+      kernel.SetArgument(6, static_cast<int>(dest_two));
+      kernel.SetArgument(7, static_cast<int>(dest_ld));
+      kernel.SetArgument(8, static_cast<int>(dest_offset));
+      kernel.SetArgument(9, dest());
+      kernel.SetArgument(10, alpha_buffer());
+      if (do_pad) {
+        kernel.SetArgument(11, static_cast<int>(do_conjugate));
+      }
+      else {
+        kernel.SetArgument(11, static_cast<int>(upper));
+        kernel.SetArgument(12, static_cast<int>(lower));
+        kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
+      }
+    }
+
+    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
+    // parameters in the database.
+    if (do_transpose) {
+      if (use_fast_kernel) {
+        const auto global = std::vector<size_t>{
+          dest_one / db["TRA_WPT"],
+          dest_two / db["TRA_WPT"]
+        };
+        const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+      else {
+        const auto global = std::vector<size_t>{
+          Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
+          Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
+        };
+        const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+    }
+    else {
+      if (use_fast_kernel) {
+        const auto global = std::vector<size_t>{
+          dest_one / db["COPY_VW"],
+          dest_two / db["COPY_WPT"]
+        };
+        const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+      else {
+        const auto global = std::vector<size_t>{
+          Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
+          Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
+        };
+        const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+    }
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_COMMON_H_
+#endif
diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cc
index b4add2a3..6b6e7f9e 100644
--- a/src/routines/level1/xamax.cc
+++ b/src/routines/level1/xamax.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level1/xamax.h"
+#include "routines/level1/xamax.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level1/xamax.hpp b/src/routines/level1/xamax.hpp
new file mode 100644
index 00000000..aa45a8e4
--- /dev/null
+++ b/src/routines/level1/xamax.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xamax routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XAMAX_H_
+#define CLBLAST_ROUTINES_XAMAX_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Implements the Xamax routine on top of the Routine base class. The precision is selected
+// through the template argument T.
+template <typename T>
+class Xamax: public Routine {
+ public:
+
+ // Constructor: takes the OpenCL queue and event to use; the routine name defaults to "AMAX"
+ Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
+
+ // Templated-precision implementation of the routine. The result buffer holds unsigned integers
+ // and is addressed through imax_offset; the x vector of type T is described by its buffer,
+ // offset and increment.
+ // NOTE(review): presumably n is the number of elements and the result is the index of the
+ // element with the largest absolute value, as in BLAS i*amax -- confirm in xamax.cc.
+ StatusCode DoAmax(const size_t n,
+ const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XAMAX_H_
+#endif
diff --git a/src/routines/level1/xasum.cc b/src/routines/level1/xasum.cc
index 80f04829..0c1ce903 100644
--- a/src/routines/level1/xasum.cc
+++ b/src/routines/level1/xasum.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level1/xasum.h"
+#include "routines/level1/xasum.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level1/xasum.hpp b/src/routines/level1/xasum.hpp
new file mode 100644
index 00000000..5a253f4d
--- /dev/null
+++ b/src/routines/level1/xasum.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xasum routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XASUM_H_
+#define CLBLAST_ROUTINES_XASUM_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Implements the Xasum routine on top of the Routine base class. The precision is selected
+// through the template argument T.
+template <typename T>
+class Xasum: public Routine {
+ public:
+
+ // Constructor: takes the OpenCL queue and event to use; the routine name defaults to "ASUM"
+ Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
+
+ // Templated-precision implementation of the routine. The result buffer has the same element
+ // type T as the x vector and is addressed through asum_offset; the x vector is described by its
+ // buffer, offset and increment.
+ // NOTE(review): presumably n is the number of elements and the result is the sum of absolute
+ // values, as in BLAS *asum -- confirm in xasum.cc.
+ StatusCode DoAsum(const size_t n,
+ const Buffer<T> &asum_buffer, const size_t asum_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XASUM_H_
+#endif
diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc
index 4a548757..5b6c9e77 100644
--- a/src/routines/level1/xaxpy.cc
+++ b/src/routines/level1/xaxpy.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level1/xaxpy.h"
+#include "routines/level1/xaxpy.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level1/xaxpy.hpp b/src/routines/level1/xaxpy.hpp
new file mode 100644
index 00000000..caac871e
--- /dev/null
+++ b/src/routines/level1/xaxpy.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xaxpy routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XAXPY_H_
+#define CLBLAST_ROUTINES_XAXPY_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Implements the Xaxpy routine on top of the Routine base class. The precision is selected
+// through the template argument T.
+template <typename T>
+class Xaxpy: public Routine {
+ public:
+
+ // Constructor: takes the OpenCL queue and event to use; the routine name defaults to "AXPY"
+ Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
+
+ // Templated-precision implementation of the routine. Takes the scalar alpha and the x and y
+ // vectors, each described by its buffer, offset and increment.
+ // NOTE(review): presumably n is the number of elements and the operation is y = alpha*x + y,
+ // as in BLAS *axpy -- confirm in xaxpy.cc.
+ StatusCode DoAxpy(const size_t n, const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XAXPY_H_
+#endif
diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc
index 92d31786..673ef349 100644
--- a/src/routines/level1/xcopy.cc
+++ b/src/routines/level1/xcopy.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level1/xcopy.h"
+#include "routines/level1/xcopy.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level1/xcopy.hpp b/src/routines/level1/xcopy.hpp
new file mode 100644
index 00000000..0c424ba3
--- /dev/null
+++ b/src/routines/level1/xcopy.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xcopy routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XCOPY_H_
+#define CLBLAST_ROUTINES_XCOPY_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xcopy<T>: class template over the precision type T, derived from Routine; declares DoCopy
+template <typename T>
+class Xcopy: public Routine {
+ public:
+
+ // Constructor taking the queue and the event pointer; the name argument defaults to COPY
+ Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
+
+ // Declared here without a body: takes n and buffer/offset/increment for x and for y
+ StatusCode DoCopy(const size_t n,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XCOPY_H_
+#endif
diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc
index 8709c541..bafea157 100644
--- a/src/routines/level1/xdot.cc
+++ b/src/routines/level1/xdot.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level1/xdot.h"
+#include "routines/level1/xdot.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level1/xdot.hpp b/src/routines/level1/xdot.hpp
new file mode 100644
index 00000000..02c1efaa
--- /dev/null
+++ b/src/routines/level1/xdot.hpp
@@ -0,0 +1,42 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xdot routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XDOT_H_
+#define CLBLAST_ROUTINES_XDOT_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xdot<T>: class template over the precision type T, derived from Routine; base of Xdotc and Xdotu
+template <typename T>
+class Xdot: public Routine {
+ public:
+
+ // Constructor taking the queue and the event pointer; the name argument defaults to DOT
+ Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
+
+ // Declared without a body: n, the dot buffer/offset, then x and y; do_conjugate defaults to false
+ StatusCode DoDot(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const bool do_conjugate = false);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XDOT_H_
+#endif
diff --git a/src/routines/level1/xdotc.cc b/src/routines/level1/xdotc.cc
index b3a01079..27cf2bab 100644
--- a/src/routines/level1/xdotc.cc
+++ b/src/routines/level1/xdotc.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level1/xdotc.h"
+#include "routines/level1/xdotc.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level1/xdotc.hpp b/src/routines/level1/xdotc.hpp
new file mode 100644
index 00000000..b8cbdaf5
--- /dev/null
+++ b/src/routines/level1/xdotc.hpp
@@ -0,0 +1,44 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xdotc routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XDOTC_H_
+#define CLBLAST_ROUTINES_XDOTC_H_
+
+#include "routines/level1/xdot.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xdotc<T>: class template over the precision type T, derived from Xdot<T>; declares DoDotc
+template <typename T>
+class Xdotc: public Xdot<T> {
+ public:
+
+ // Brings DoDot of the dependent base Xdot<T> into this class scope for unqualified use
+ using Xdot<T>::DoDot;
+
+ // Constructor taking the queue and the event pointer; the name argument defaults to DOTC
+ Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");
+
+ // Declared without a body: same parameters as DoDot minus do_conjugate (TODO confirm forwarding)
+ StatusCode DoDotc(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XDOTC_H_
+#endif
diff --git a/src/routines/level1/xdotu.cc b/src/routines/level1/xdotu.cc
index 8dded6e0..0bce70b7 100644
--- a/src/routines/level1/xdotu.cc
+++ b/src/routines/level1/xdotu.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level1/xdotu.h"
+#include "routines/level1/xdotu.hpp"
#include <string>
diff --git a/src/routines/level1/xdotu.hpp b/src/routines/level1/xdotu.hpp
new file mode 100644
index 00000000..b3f73086
--- /dev/null
+++ b/src/routines/level1/xdotu.hpp
@@ -0,0 +1,44 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xdotu routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XDOTU_H_
+#define CLBLAST_ROUTINES_XDOTU_H_
+
+#include "routines/level1/xdot.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xdotu<T>: class template over the precision type T, derived from Xdot<T>; declares DoDotu
+template <typename T>
+class Xdotu: public Xdot<T> {
+ public:
+
+ // Brings DoDot of the dependent base Xdot<T> into this class scope for unqualified use
+ using Xdot<T>::DoDot;
+
+ // Constructor taking the queue and the event pointer; the name argument defaults to DOTU
+ Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");
+
+ // Declared without a body: same parameters as DoDot minus do_conjugate (TODO confirm forwarding)
+ StatusCode DoDotu(const size_t n,
+ const Buffer<T> &dot_buffer, const size_t dot_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XDOTU_H_
+#endif
diff --git a/src/routines/level1/xmax.hpp b/src/routines/level1/xmax.hpp
new file mode 100644
index 00000000..5a0236f2
--- /dev/null
+++ b/src/routines/level1/xmax.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xmax routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XMAX_H_
+#define CLBLAST_ROUTINES_XMAX_H_
+
+#include "routine.hpp"
+#include "routines/level1/xamax.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xmax<T>: header-only subclass of Xamax<T> whose DoMax forwards to the inherited DoAmax
+template <typename T>
+class Xmax: public Xamax<T> {
+ public:
+
+ // Brings DoAmax of the dependent base Xamax<T> into scope so DoMax can call it unqualified
+ using Xamax<T>::DoAmax;
+
+ // Constructor: passes queue, event and name (default MAX) on to the Xamax<T> constructor
+ Xmax(Queue &queue, EventPointer event, const std::string &name = "MAX"):
+ Xamax<T>(queue, event, name) {
+ }
+
+ // Calls DoAmax with all arguments unchanged and returns its status. The implementation
+ // difference is realised in the kernel through a pre-processor macro based on the routine name.
+ StatusCode DoMax(const size_t n,
+ const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+ return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XMAX_H_
+#endif
diff --git a/src/routines/level1/xmin.hpp b/src/routines/level1/xmin.hpp
new file mode 100644
index 00000000..6befec64
--- /dev/null
+++ b/src/routines/level1/xmin.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xmin routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XMIN_H_
+#define CLBLAST_ROUTINES_XMIN_H_
+
+#include "routine.hpp"
+#include "routines/level1/xamax.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xmin<T>: header-only subclass of Xamax<T> whose DoMin forwards to the inherited DoAmax
+template <typename T>
+class Xmin: public Xamax<T> {
+ public:
+
+ // Brings DoAmax of the dependent base Xamax<T> into scope so DoMin can call it unqualified
+ using Xamax<T>::DoAmax;
+
+ // Constructor: passes queue, event and name (default MIN) on to the Xamax<T> constructor
+ Xmin(Queue &queue, EventPointer event, const std::string &name = "MIN"):
+ Xamax<T>(queue, event, name) {
+ }
+
+ // Calls DoAmax (the max-absolute version) with all arguments unchanged. The implementation
+ // difference is realised in the kernel through a pre-processor macro based on the routine name.
+ StatusCode DoMin(const size_t n,
+ const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+ return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XMIN_H_
+#endif
diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc
index 105f991c..97615d8b 100644
--- a/src/routines/level1/xnrm2.cc
+++ b/src/routines/level1/xnrm2.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level1/xnrm2.h"
+#include "routines/level1/xnrm2.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level1/xnrm2.hpp b/src/routines/level1/xnrm2.hpp
new file mode 100644
index 00000000..7baf07f5
--- /dev/null
+++ b/src/routines/level1/xnrm2.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xnrm2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XNRM2_H_
+#define CLBLAST_ROUTINES_XNRM2_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xnrm2<T>: class template over the precision type T, derived from Routine; declares DoNrm2
+template <typename T>
+class Xnrm2: public Routine {
+ public:
+
+ // Constructor taking the queue and the event pointer; the name argument defaults to NRM2
+ Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
+
+ // Declared here without a body: takes n, the nrm2 buffer/offset, and x buffer/offset/increment
+ StatusCode DoNrm2(const size_t n,
+ const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XNRM2_H_
+#endif
diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc
index 3c1b5257..bcc43c3b 100644
--- a/src/routines/level1/xscal.cc
+++ b/src/routines/level1/xscal.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level1/xscal.h"
+#include "routines/level1/xscal.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level1/xscal.hpp b/src/routines/level1/xscal.hpp
new file mode 100644
index 00000000..6c585cb2
--- /dev/null
+++ b/src/routines/level1/xscal.hpp
@@ -0,0 +1,39 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xscal routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSCAL_H_
+#define CLBLAST_ROUTINES_XSCAL_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xscal<T>: class template over the precision type T, derived from Routine; declares DoScal
+template <typename T>
+class Xscal: public Routine {
+ public:
+
+ // Constructor taking the queue and the event pointer; the name argument defaults to SCAL
+ Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
+
+ // Declared here without a body: takes n, alpha, and the x buffer/offset/increment
+ StatusCode DoScal(const size_t n, const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSCAL_H_
+#endif
diff --git a/src/routines/level1/xsum.hpp b/src/routines/level1/xsum.hpp
new file mode 100644
index 00000000..84e20bea
--- /dev/null
+++ b/src/routines/level1/xsum.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsum routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSUM_H_
+#define CLBLAST_ROUTINES_XSUM_H_
+
+#include "routine.hpp"
+#include "routines/level1/xasum.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xsum<T>: header-only subclass of Xasum<T> whose DoSum forwards to the inherited DoAsum
+template <typename T>
+class Xsum: public Xasum<T> {
+ public:
+
+ // Brings DoAsum of the dependent base Xasum<T> into scope so DoSum can call it unqualified
+ using Xasum<T>::DoAsum;
+
+ // Constructor: passes queue, event and name (default SUM) on to the Xasum<T> constructor
+ Xsum(Queue &queue, EventPointer event, const std::string &name = "SUM"):
+ Xasum<T>(queue, event, name) {
+ }
+
+ // Calls DoAsum with all arguments unchanged and returns its status. The implementation
+ // difference is realised in the kernel through a pre-processor macro based on the routine name.
+ StatusCode DoSum(const size_t n,
+ const Buffer<T> &sum_buffer, const size_t sum_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+ return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSUM_H_
+#endif
diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc
index 27eb9b13..03907cbd 100644
--- a/src/routines/level1/xswap.cc
+++ b/src/routines/level1/xswap.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level1/xswap.h"
+#include "routines/level1/xswap.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level1/xswap.hpp b/src/routines/level1/xswap.hpp
new file mode 100644
index 00000000..4f9ea36d
--- /dev/null
+++ b/src/routines/level1/xswap.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xswap routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSWAP_H_
+#define CLBLAST_ROUTINES_XSWAP_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xswap<T>: class template over the precision type T, derived from Routine; declares DoSwap
+template <typename T>
+class Xswap: public Routine {
+ public:
+
+ // Constructor taking the queue and the event pointer; the name argument defaults to SWAP
+ Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
+
+ // Declared here without a body: takes n and buffer/offset/increment for x and for y
+ StatusCode DoSwap(const size_t n,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSWAP_H_
+#endif
diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc
index 7a30c34a..ea4f001c 100644
--- a/src/routines/level2/xgbmv.cc
+++ b/src/routines/level2/xgbmv.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xgbmv.h"
+#include "routines/level2/xgbmv.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level2/xgbmv.hpp b/src/routines/level2/xgbmv.hpp
new file mode 100644
index 00000000..686ab642
--- /dev/null
+++ b/src/routines/level2/xgbmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xgbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGBMV_H_
+#define CLBLAST_ROUTINES_XGBMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xgbmv<T>: class template over the precision type T, derived from Xgemv<T>; declares DoGbmv
+template <typename T>
+class Xgbmv: public Xgemv<T> {
+ public:
+
+ // Brings MatVec of the dependent base Xgemv<T> into this class scope for unqualified use
+ using Xgemv<T>::MatVec;
+
+ // Constructor taking the queue and the event pointer; the name argument defaults to GBMV
+ Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");
+
+ // Declared without a body: layout, a_transpose, m, n, kl, ku, alpha, then a, x, beta and y
+ StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGBMV_H_
+#endif
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc
index ccadd131..21fb397c 100644
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level2/xgemv.hpp b/src/routines/level2/xgemv.hpp
new file mode 100644
index 00000000..e9afec8d
--- /dev/null
+++ b/src/routines/level2/xgemv.hpp
@@ -0,0 +1,56 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemv routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGEMV_H_
+#define CLBLAST_ROUTINES_XGEMV_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xgemv<T>: class template over the precision type T, derived from Routine; declares DoGemv/MatVec
+template <typename T>
+class Xgemv: public Routine {
+ public:
+
+ // Constructor taking the queue and the event pointer; the name argument defaults to GEMV
+ Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
+
+ // Declared without a body: layout, a_transpose, m, n, alpha, then a, x, beta and y operands
+ StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+
+ // Generic version also used by other matrix-vector routines (Xgbmv, Xhbmv import it via using)
+ StatusCode MatVec(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ bool fast_kernel, bool fast_kernel_rot,
+ const size_t parameter, const bool packed,
+ const size_t kl, const size_t ku);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGEMV_H_
+#endif
diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc
index 6ceaa00e..353047d2 100644
--- a/src/routines/level2/xger.cc
+++ b/src/routines/level2/xger.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xger.h"
+#include "routines/level2/xger.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level2/xger.hpp b/src/routines/level2/xger.hpp
new file mode 100644
index 00000000..3c6abe44
--- /dev/null
+++ b/src/routines/level2/xger.hpp
@@ -0,0 +1,43 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xger routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGER_H_
+#define CLBLAST_ROUTINES_XGER_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xger<T>: class template over the precision type T, derived from Routine; base of Xgerc and Xgeru
+template <typename T>
+class Xger: public Routine {
+ public:
+
+ // Constructor taking the queue and the event pointer; the name argument defaults to GER
+ Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
+
+ // Declared without a body: layout, m, n, alpha, then x and y (offset/increment) and a (offset/ld)
+ StatusCode DoGer(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGER_H_
+#endif
diff --git a/src/routines/level2/xgerc.cc b/src/routines/level2/xgerc.cc
index 73284b52..d9feda97 100644
--- a/src/routines/level2/xgerc.cc
+++ b/src/routines/level2/xgerc.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xgerc.h"
+#include "routines/level2/xgerc.hpp"
#include <string>
diff --git a/src/routines/level2/xgerc.hpp b/src/routines/level2/xgerc.hpp
new file mode 100644
index 00000000..f1d04dfd
--- /dev/null
+++ b/src/routines/level2/xgerc.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgerc routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGERC_H_
+#define CLBLAST_ROUTINES_XGERC_H_
+
+#include "routines/level2/xger.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xgerc<T>: class template over the precision type T, derived from Xger<T>; declares DoGerc
+template <typename T>
+class Xgerc: public Xger<T> {
+ public:
+
+ // Brings DoGer of the dependent base Xger<T> into this class scope for unqualified use
+ using Xger<T>::DoGer;
+
+ // Constructor taking the queue and the event pointer; the name argument defaults to GERC
+ Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");
+
+ // Declared without a body; the parameter list is identical to that of Xger<T>::DoGer
+ StatusCode DoGerc(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGERC_H_
+#endif
diff --git a/src/routines/level2/xgeru.cc b/src/routines/level2/xgeru.cc
index 7730d6a5..da9e91c2 100644
--- a/src/routines/level2/xgeru.cc
+++ b/src/routines/level2/xgeru.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xgeru.h"
+#include "routines/level2/xgeru.hpp"
#include <string>
diff --git a/src/routines/level2/xgeru.hpp b/src/routines/level2/xgeru.hpp
new file mode 100644
index 00000000..fb50e917
--- /dev/null
+++ b/src/routines/level2/xgeru.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgeru routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGERU_H_
+#define CLBLAST_ROUTINES_XGERU_H_
+
+#include "routines/level2/xger.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xgeru<T>: class template over the precision type T, derived from Xger<T>; declares DoGeru
+template <typename T>
+class Xgeru: public Xger<T> {
+ public:
+
+ // Brings DoGer of the dependent base Xger<T> into this class scope for unqualified use
+ using Xger<T>::DoGer;
+
+ // Constructor taking the queue and the event pointer; the name argument defaults to GERU
+ Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");
+
+ // Declared without a body; the parameter list is identical to that of Xger<T>::DoGer
+ StatusCode DoGeru(const Layout layout,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGERU_H_
+#endif
diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc
index 58591b50..f6c0e3c4 100644
--- a/src/routines/level2/xhbmv.cc
+++ b/src/routines/level2/xhbmv.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xhbmv.h"
+#include "routines/level2/xhbmv.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level2/xhbmv.hpp b/src/routines/level2/xhbmv.hpp
new file mode 100644
index 00000000..d668eb88
--- /dev/null
+++ b/src/routines/level2/xhbmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHBMV_H_
+#define CLBLAST_ROUTINES_XHBMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xhbmv routine: templated on the data-type 'T' and derived from Xgemv<T>
+template <typename T>
+class Xhbmv: public Xgemv<T> {
+ public:
+
+ // Brings the inherited Xgemv<T>::MatVec into scope (member of a dependent base class)
+ using Xgemv<T>::MatVec;
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "HBMV"
+ Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");
+
+ // Templated-precision implementation; takes sizes n and k, scalars alpha and beta, and 3 buffers
+ StatusCode DoHbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHBMV_H_
+#endif
diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc
index b4ef0fa4..2cbcf7b4 100644
--- a/src/routines/level2/xhemv.cc
+++ b/src/routines/level2/xhemv.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xhemv.h"
+#include "routines/level2/xhemv.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level2/xhemv.hpp b/src/routines/level2/xhemv.hpp
new file mode 100644
index 00000000..8e062fd3
--- /dev/null
+++ b/src/routines/level2/xhemv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhemv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHEMV_H_
+#define CLBLAST_ROUTINES_XHEMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xhemv routine: templated on the data-type 'T' and derived from Xgemv<T>
+template <typename T>
+class Xhemv: public Xgemv<T> {
+ public:
+
+ // Brings the inherited Xgemv<T>::MatVec into scope (member of a dependent base class)
+ using Xgemv<T>::MatVec;
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "HEMV"
+ Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");
+
+ // Templated-precision implementation; takes size n, scalars alpha and beta, and 3 buffers
+ StatusCode DoHemv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHEMV_H_
+#endif
diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc
index 939e17bb..ed8ba9e9 100644
--- a/src/routines/level2/xher.cc
+++ b/src/routines/level2/xher.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xher.h"
+#include "routines/level2/xher.hpp"
#include <string>
diff --git a/src/routines/level2/xher.hpp b/src/routines/level2/xher.hpp
new file mode 100644
index 00000000..9ff6bf3f
--- /dev/null
+++ b/src/routines/level2/xher.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHER_H_
+#define CLBLAST_ROUTINES_XHER_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xher routine: 'T' is the buffer element type, 'U' is the type of alpha
+template <typename T, typename U>
+class Xher: public Routine {
+ public:
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "HER"
+ Xher(Queue &queue, EventPointer event, const std::string &name = "HER");
+
+ // Translates alpha of type 'U' (as passed to DoHer) into the buffer element type 'T'
+ T GetAlpha(const U alpha);
+
+ // Templated-precision implementation; the trailing 'packed' flag defaults to false
+ StatusCode DoHer(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const U alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const bool packed = false);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHER_H_
+#endif
diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc
index 95dbd87a..50572cea 100644
--- a/src/routines/level2/xher2.cc
+++ b/src/routines/level2/xher2.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xher2.h"
+#include "routines/level2/xher2.hpp"
#include <string>
diff --git a/src/routines/level2/xher2.hpp b/src/routines/level2/xher2.hpp
new file mode 100644
index 00000000..8c53c047
--- /dev/null
+++ b/src/routines/level2/xher2.hpp
@@ -0,0 +1,44 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHER2_H_
+#define CLBLAST_ROUTINES_XHER2_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xher2 routine: templated on the data-type 'T' and derived from Routine
+template <typename T>
+class Xher2: public Routine {
+ public:
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "HER2"
+ Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
+
+ // Templated-precision implementation; the trailing 'packed' flag defaults to false
+ StatusCode DoHer2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const bool packed = false);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHER2_H_
+#endif
diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc
index 92686dbe..e6f82b34 100644
--- a/src/routines/level2/xhpmv.cc
+++ b/src/routines/level2/xhpmv.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xhpmv.h"
+#include "routines/level2/xhpmv.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level2/xhpmv.hpp b/src/routines/level2/xhpmv.hpp
new file mode 100644
index 00000000..b11192f9
--- /dev/null
+++ b/src/routines/level2/xhpmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhpmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHPMV_H_
+#define CLBLAST_ROUTINES_XHPMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xhpmv routine: templated on the data-type 'T' and derived from Xgemv<T>
+template <typename T>
+class Xhpmv: public Xgemv<T> {
+ public:
+
+ // Brings the inherited Xgemv<T>::MatVec into scope (member of a dependent base class)
+ using Xgemv<T>::MatVec;
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "HPMV"
+ Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");
+
+ // Templated-precision implementation; 'ap_buffer' is passed with an offset only (no 'ld')
+ StatusCode DoHpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHPMV_H_
+#endif
diff --git a/src/routines/level2/xhpr.cc b/src/routines/level2/xhpr.cc
index 4b31ad09..225ebfe5 100644
--- a/src/routines/level2/xhpr.cc
+++ b/src/routines/level2/xhpr.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xhpr.h"
+#include "routines/level2/xhpr.hpp"
#include <string>
diff --git a/src/routines/level2/xhpr.hpp b/src/routines/level2/xhpr.hpp
new file mode 100644
index 00000000..37801c68
--- /dev/null
+++ b/src/routines/level2/xhpr.hpp
@@ -0,0 +1,45 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpr routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHPR_H_
+#define CLBLAST_ROUTINES_XHPR_H_
+
+#include "routines/level2/xher.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xhpr routine: templated on 'T' (buffers) and 'U' (alpha), derived from Xher<T,U>
+template <typename T, typename U>
+class Xhpr: public Xher<T,U> {
+ public:
+
+ // Brings the inherited Xher<T,U>::DoHer into scope (member of a dependent base class)
+ using Xher<T,U>::DoHer;
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "HPR"
+ Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");
+
+ // Templated-precision implementation; 'ap_buffer' is passed with an offset only (no 'ld')
+ StatusCode DoHpr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const U alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHPR_H_
+#endif
diff --git a/src/routines/level2/xhpr2.cc b/src/routines/level2/xhpr2.cc
index 9be24f43..85f9d3f9 100644
--- a/src/routines/level2/xhpr2.cc
+++ b/src/routines/level2/xhpr2.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xhpr2.h"
+#include "routines/level2/xhpr2.hpp"
#include <string>
diff --git a/src/routines/level2/xhpr2.hpp b/src/routines/level2/xhpr2.hpp
new file mode 100644
index 00000000..d66dce55
--- /dev/null
+++ b/src/routines/level2/xhpr2.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpr2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHPR2_H_
+#define CLBLAST_ROUTINES_XHPR2_H_
+
+#include "routines/level2/xher2.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xhpr2 routine: templated on the data-type 'T' and derived from Xher2<T>
+template <typename T>
+class Xhpr2: public Xher2<T> {
+ public:
+
+ // Brings the inherited Xher2<T>::DoHer2 into scope (member of a dependent base class)
+ using Xher2<T>::DoHer2;
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "HPR2"
+ Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");
+
+ // Templated-precision implementation; 'ap_buffer' is passed with an offset only (no 'ld')
+ StatusCode DoHpr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHPR2_H_
+#endif
diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc
index 66ba74e8..28730899 100644
--- a/src/routines/level2/xsbmv.cc
+++ b/src/routines/level2/xsbmv.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xsbmv.h"
+#include "routines/level2/xsbmv.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level2/xsbmv.hpp b/src/routines/level2/xsbmv.hpp
new file mode 100644
index 00000000..16c5e9a8
--- /dev/null
+++ b/src/routines/level2/xsbmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xsbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSBMV_H_
+#define CLBLAST_ROUTINES_XSBMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xsbmv routine: templated on the data-type 'T' and derived from Xgemv<T>
+template <typename T>
+class Xsbmv: public Xgemv<T> {
+ public:
+
+ // Brings the inherited Xgemv<T>::MatVec into scope (member of a dependent base class)
+ using Xgemv<T>::MatVec;
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "SBMV"
+ Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");
+
+ // Templated-precision implementation; takes sizes n and k, scalars alpha and beta, and 3 buffers
+ StatusCode DoSbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSBMV_H_
+#endif
diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc
index 589a97d4..f6651012 100644
--- a/src/routines/level2/xspmv.cc
+++ b/src/routines/level2/xspmv.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xspmv.h"
+#include "routines/level2/xspmv.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level2/xspmv.hpp b/src/routines/level2/xspmv.hpp
new file mode 100644
index 00000000..a0c69b85
--- /dev/null
+++ b/src/routines/level2/xspmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xspmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSPMV_H_
+#define CLBLAST_ROUTINES_XSPMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xspmv routine: templated on the data-type 'T' and derived from Xgemv<T>
+template <typename T>
+class Xspmv: public Xgemv<T> {
+ public:
+
+ // Brings the inherited Xgemv<T>::MatVec into scope (member of a dependent base class)
+ using Xgemv<T>::MatVec;
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "SPMV"
+ Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");
+
+ // Templated-precision implementation; 'ap_buffer' is passed with an offset only (no 'ld')
+ StatusCode DoSpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSPMV_H_
+#endif
diff --git a/src/routines/level2/xspr.cc b/src/routines/level2/xspr.cc
index c556b920..a75fe9c3 100644
--- a/src/routines/level2/xspr.cc
+++ b/src/routines/level2/xspr.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xspr.h"
+#include "routines/level2/xspr.hpp"
#include <string>
diff --git a/src/routines/level2/xspr.hpp b/src/routines/level2/xspr.hpp
new file mode 100644
index 00000000..6468c736
--- /dev/null
+++ b/src/routines/level2/xspr.hpp
@@ -0,0 +1,45 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspr routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSPR_H_
+#define CLBLAST_ROUTINES_XSPR_H_
+
+#include "routines/level2/xher.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xspr routine: templated on 'T' and derived from Xher<T,T> (alpha is also a 'T')
+template <typename T>
+class Xspr: public Xher<T,T> {
+ public:
+
+ // Brings the inherited Xher<T,T>::DoHer into scope (member of a dependent base class)
+ using Xher<T,T>::DoHer;
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "SPR"
+ Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");
+
+ // Templated-precision implementation; 'ap_buffer' is passed with an offset only (no 'ld')
+ StatusCode DoSpr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSPR_H_
+#endif
diff --git a/src/routines/level2/xspr2.cc b/src/routines/level2/xspr2.cc
index c4ad5dc4..c39a2eb4 100644
--- a/src/routines/level2/xspr2.cc
+++ b/src/routines/level2/xspr2.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xspr2.h"
+#include "routines/level2/xspr2.hpp"
#include <string>
diff --git a/src/routines/level2/xspr2.hpp b/src/routines/level2/xspr2.hpp
new file mode 100644
index 00000000..693c56a1
--- /dev/null
+++ b/src/routines/level2/xspr2.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspr2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSPR2_H_
+#define CLBLAST_ROUTINES_XSPR2_H_
+
+#include "routines/level2/xher2.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xspr2 routine: templated on the data-type 'T' and derived from Xher2<T>
+template <typename T>
+class Xspr2: public Xher2<T> {
+ public:
+
+ // Brings the inherited Xher2<T>::DoHer2 into scope (member of a dependent base class)
+ using Xher2<T>::DoHer2;
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "SPR2"
+ Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");
+
+ // Templated-precision implementation; 'ap_buffer' is passed with an offset only (no 'ld')
+ StatusCode DoSpr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &ap_buffer, const size_t ap_offset);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSPR2_H_
+#endif
diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc
index 2a404a8a..648d2a3e 100644
--- a/src/routines/level2/xsymv.cc
+++ b/src/routines/level2/xsymv.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xsymv.h"
+#include "routines/level2/xsymv.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level2/xsymv.hpp b/src/routines/level2/xsymv.hpp
new file mode 100644
index 00000000..67815f2f
--- /dev/null
+++ b/src/routines/level2/xsymv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xsymv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYMV_H_
+#define CLBLAST_ROUTINES_XSYMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xsymv routine: templated on the data-type 'T' and derived from Xgemv<T>
+template <typename T>
+class Xsymv: public Xgemv<T> {
+ public:
+
+ // Brings the inherited Xgemv<T>::MatVec into scope (member of a dependent base class)
+ using Xgemv<T>::MatVec;
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "SYMV"
+ Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV");
+
+ // Templated-precision implementation; takes size n, scalars alpha and beta, and 3 buffers
+ StatusCode DoSymv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYMV_H_
+#endif
diff --git a/src/routines/level2/xsyr.cc b/src/routines/level2/xsyr.cc
index 892517d7..758d8f8f 100644
--- a/src/routines/level2/xsyr.cc
+++ b/src/routines/level2/xsyr.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xsyr.h"
+#include "routines/level2/xsyr.hpp"
#include <string>
diff --git a/src/routines/level2/xsyr.hpp b/src/routines/level2/xsyr.hpp
new file mode 100644
index 00000000..20393454
--- /dev/null
+++ b/src/routines/level2/xsyr.hpp
@@ -0,0 +1,45 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYR_H_
+#define CLBLAST_ROUTINES_XSYR_H_
+
+#include "routines/level2/xher.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xsyr routine: templated on 'T' and derived from Xher<T,T> (alpha is also a 'T')
+template <typename T>
+class Xsyr: public Xher<T,T> {
+ public:
+
+ // Brings the inherited Xher<T,T>::DoHer into scope (member of a dependent base class)
+ using Xher<T,T>::DoHer;
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "SYR"
+ Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR");
+
+ // Templated-precision implementation; the x and a buffers each come with an offset + inc/ld
+ StatusCode DoSyr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYR_H_
+#endif
diff --git a/src/routines/level2/xsyr2.cc b/src/routines/level2/xsyr2.cc
index e6dfd158..6f43b219 100644
--- a/src/routines/level2/xsyr2.cc
+++ b/src/routines/level2/xsyr2.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xsyr2.h"
+#include "routines/level2/xsyr2.hpp"
#include <string>
diff --git a/src/routines/level2/xsyr2.hpp b/src/routines/level2/xsyr2.hpp
new file mode 100644
index 00000000..1a8dcbe8
--- /dev/null
+++ b/src/routines/level2/xsyr2.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYR2_H_
+#define CLBLAST_ROUTINES_XSYR2_H_
+
+#include "routines/level2/xher2.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Class for the Xsyr2 routine: templated on the data-type 'T' and derived from Xher2<T>
+template <typename T>
+class Xsyr2: public Xher2<T> {
+ public:
+
+ // Brings the inherited Xher2<T>::DoHer2 into scope (member of a dependent base class)
+ using Xher2<T>::DoHer2;
+
+ // Constructor: takes the queue and event to use; the routine name defaults to "SYR2"
+ Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2");
+
+ // Templated-precision implementation; the x, y, and a buffers each come with an offset + inc/ld
+ StatusCode DoSyr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYR2_H_
+#endif
diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc
index 86e28dfb..e315c544 100644
--- a/src/routines/level2/xtbmv.cc
+++ b/src/routines/level2/xtbmv.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xtbmv.h"
+#include "routines/level2/xtbmv.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level2/xtbmv.hpp b/src/routines/level2/xtbmv.hpp
new file mode 100644
index 00000000..389e9705
--- /dev/null
+++ b/src/routines/level2/xtbmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTBMV_H_
+#define CLBLAST_ROUTINES_XTBMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xtbmv routine class: derives from Xgemv<T> so it can call "MatVec" directly (see file header)
+template <typename T>
+class Xtbmv: public Xgemv<T> {
+ public:
+
+ // Members of the dependent base class Xgemv<T> that are made usable by name within this class
+ using Xgemv<T>::queue_;
+ using Xgemv<T>::context_;
+ using Xgemv<T>::MatVec;
+
+ // Constructor; 'name' defaults to "TBMV"
+ Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV");
+
+ // Templated-precision implementation of the routine; both buffers hold elements of type T
+ StatusCode DoTbmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n, const size_t k,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTBMV_H_
+#endif
diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc
index 72445547..46811089 100644
--- a/src/routines/level2/xtpmv.cc
+++ b/src/routines/level2/xtpmv.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xtpmv.h"
+#include "routines/level2/xtpmv.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level2/xtpmv.hpp b/src/routines/level2/xtpmv.hpp
new file mode 100644
index 00000000..0e8cf1d2
--- /dev/null
+++ b/src/routines/level2/xtpmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtpmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtpmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTPMV_H_
+#define CLBLAST_ROUTINES_XTPMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xtpmv routine class: derives from Xgemv<T> so it can call "MatVec" directly (see file header)
+template <typename T>
+class Xtpmv: public Xgemv<T> {
+ public:
+
+ // Members of the dependent base class Xgemv<T> that are made usable by name within this class
+ using Xgemv<T>::queue_;
+ using Xgemv<T>::context_;
+ using Xgemv<T>::MatVec;
+
+ // Constructor; 'name' defaults to "TPMV"
+ Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV");
+
+ // Templated-precision implementation of the routine; note that 'ap_buffer' only has an offset
+ StatusCode DoTpmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTPMV_H_
+#endif
diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc
index df6f85a3..d2f24252 100644
--- a/src/routines/level2/xtrmv.cc
+++ b/src/routines/level2/xtrmv.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level2/xtrmv.h"
+#include "routines/level2/xtrmv.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level2/xtrmv.hpp b/src/routines/level2/xtrmv.hpp
new file mode 100644
index 00000000..07dd7841
--- /dev/null
+++ b/src/routines/level2/xtrmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtrmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTRMV_H_
+#define CLBLAST_ROUTINES_XTRMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xtrmv routine class: derives from Xgemv<T> so it can call "MatVec" directly (see file header)
+template <typename T>
+class Xtrmv: public Xgemv<T> {
+ public:
+
+ // Members of the dependent base class Xgemv<T> that are made usable by name within this class
+ using Xgemv<T>::queue_;
+ using Xgemv<T>::context_;
+ using Xgemv<T>::MatVec;
+
+ // Constructor; 'name' defaults to "TRMV"
+ Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV");
+
+ // Templated-precision implementation of the routine; both buffers hold elements of type T
+ StatusCode DoTrmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTRMV_H_
+#endif
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
index 8386ad09..9ea5559c 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level3/xgemm.h"
+#include "routines/level3/xgemm.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp
new file mode 100644
index 00000000..71723d78
--- /dev/null
+++ b/src/routines/level3/xgemm.hpp
@@ -0,0 +1,48 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemm routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGEMM_H_
+#define CLBLAST_ROUTINES_XGEMM_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xgemm routine class: derives from Routine and is templated on the precision type T
+template <typename T>
+class Xgemm: public Routine {
+ public:
+
+ // Constructor; 'name' defaults to "GEMM"
+ Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM");
+
+ // Templated-precision implementation of the routine; alpha, beta and all buffers use type T
+ StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ protected:
+ // Static variable to get the precision (declared here without an initializer)
+ const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGEMM_H_
+#endif
diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc
index 8120c09c..9813503e 100644
--- a/src/routines/level3/xhemm.cc
+++ b/src/routines/level3/xhemm.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level3/xhemm.h"
+#include "routines/level3/xhemm.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level3/xhemm.hpp b/src/routines/level3/xhemm.hpp
new file mode 100644
index 00000000..d79b42a1
--- /dev/null
+++ b/src/routines/level3/xhemm.hpp
@@ -0,0 +1,54 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhemm routine. It is based on the generalized matrix multiplication
+// routine (Xgemm). The implementation is very similar to the Xsymm routine.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHEMM_H_
+#define CLBLAST_ROUTINES_XHEMM_H_
+
+#include "routines/level3/xgemm.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xhemm routine class: derives from Xgemm<T>, on which the routine is based (see file header)
+template <typename T>
+class Xhemm: public Xgemm<T> {
+ public:
+
+ // Uses methods and variables of the regular Xgemm routine (named here: dependent base class)
+ using Xgemm<T>::precision_;
+ using Xgemm<T>::routine_name_;
+ using Xgemm<T>::queue_;
+ using Xgemm<T>::context_;
+ using Xgemm<T>::device_;
+ using Xgemm<T>::db_;
+ using Xgemm<T>::DoGemm;
+
+ // Constructor; 'name' defaults to "HEMM"
+ Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM");
+
+ // Templated-precision implementation of the routine; alpha, beta and all buffers use type T
+ StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHEMM_H_
+#endif
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
index bd0f83dd..bd7a053e 100644
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level3/xher2k.h"
+#include "routines/level3/xher2k.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level3/xher2k.hpp b/src/routines/level3/xher2k.hpp
new file mode 100644
index 00000000..23996219
--- /dev/null
+++ b/src/routines/level3/xher2k.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher2k routine. The precision is implemented using the template argument
+// 'T' (also used for alpha), whereas the beta argument is of type 'U'. The implementation is very
+// similar to the Xsyr2k routine.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHER2K_H_
+#define CLBLAST_ROUTINES_XHER2K_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xher2k routine class: derives from Routine; templated on T (buffers, alpha) and U (beta)
+template <typename T, typename U>
+class Xher2k: public Routine {
+ public:
+
+ // Constructor; 'name' defaults to "HER2K"
+ Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K");
+
+ // Templated-precision implementation of the routine; note: alpha is of type T, beta of type U
+ StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const U beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHER2K_H_
+#endif
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
index 6155734a..6ef7f21f 100644
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level3/xherk.h"
+#include "routines/level3/xherk.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level3/xherk.hpp b/src/routines/level3/xherk.hpp
new file mode 100644
index 00000000..3f156a1b
--- /dev/null
+++ b/src/routines/level3/xherk.hpp
@@ -0,0 +1,45 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xherk routine. The precision is implemented using the template argument
+// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the
+// Xsyrk routine.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHERK_H_
+#define CLBLAST_ROUTINES_XHERK_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xherk routine class: derives from Routine; templated on T (buffers) and U (alpha and beta)
+template <typename T, typename U>
+class Xherk: public Routine {
+ public:
+
+ // Constructor; 'name' defaults to "HERK"
+ Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK");
+
+ // Templated-precision implementation of the routine; alpha and beta are both of type U
+ StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+ const size_t n, const size_t k,
+ const U alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const U beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHERK_H_
+#endif
diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc
index c5e56617..04e4b718 100644
--- a/src/routines/level3/xsymm.cc
+++ b/src/routines/level3/xsymm.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level3/xsymm.h"
+#include "routines/level3/xsymm.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level3/xsymm.hpp b/src/routines/level3/xsymm.hpp
new file mode 100644
index 00000000..754dd7a0
--- /dev/null
+++ b/src/routines/level3/xsymm.hpp
@@ -0,0 +1,56 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymm routine. It is based on the generalized matrix multiplication
+// routine (Xgemm). The Xsymm class inherits from the templated class Xgemm, allowing it to call the
+// "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by
+// transforming it into a general matrix, and then calls the regular GEMM code.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYMM_H_
+#define CLBLAST_ROUTINES_XSYMM_H_
+
+#include "routines/level3/xgemm.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xsymm routine class: derives from Xgemm<T> so it can call "DoGemm" directly (see file header)
+template <typename T>
+class Xsymm: public Xgemm<T> {
+ public:
+
+ // Uses methods and variables of the regular Xgemm routine (named here: dependent base class)
+ using Xgemm<T>::precision_;
+ using Xgemm<T>::routine_name_;
+ using Xgemm<T>::queue_;
+ using Xgemm<T>::context_;
+ using Xgemm<T>::device_;
+ using Xgemm<T>::db_;
+ using Xgemm<T>::DoGemm;
+
+ // Constructor; 'name' defaults to "SYMM"
+ Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM");
+
+ // Templated-precision implementation of the routine; alpha, beta and all buffers use type T
+ StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYMM_H_
+#endif
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
index f9655889..424d4d2d 100644
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level3/xsyr2k.h"
+#include "routines/level3/xsyr2k.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level3/xsyr2k.hpp b/src/routines/level3/xsyr2k.hpp
new file mode 100644
index 00000000..56185653
--- /dev/null
+++ b/src/routines/level3/xsyr2k.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2k routine. The precision is implemented using a template argument.
+// The implementation is very similar to Xsyrk (see header for details), except for the fact that
+// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYR2K_H_
+#define CLBLAST_ROUTINES_XSYR2K_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xsyr2k routine class: derives from Routine and is templated on the precision type T
+template <typename T>
+class Xsyr2k: public Routine {
+ public:
+
+ // Constructor; 'name' defaults to "SYR2K"
+ Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");
+
+ // Templated-precision implementation of the routine; alpha, beta and all buffers use type T
+ StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYR2K_H_
+#endif
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
index bceb6afd..f56c232b 100644
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level3/xsyrk.h"
+#include "routines/level3/xsyrk.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level3/xsyrk.hpp b/src/routines/level3/xsyrk.hpp
new file mode 100644
index 00000000..7c075c26
--- /dev/null
+++ b/src/routines/level3/xsyrk.hpp
@@ -0,0 +1,47 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyrk routine. The precision is implemented using a template argument.
+// The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
+// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
+// 2) The main Xgemm kernel masks workgroups not contributing to useful data. This is only for
+// performance reasons, as the actual masking is done later (see the first point).
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYRK_H_
+#define CLBLAST_ROUTINES_XSYRK_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xsyrk routine class: derives from Routine and is templated on the precision type T
+template <typename T>
+class Xsyrk: public Routine {
+ public:
+
+ // Constructor; 'name' defaults to "SYRK"
+ Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK");
+
+ // Templated-precision implementation of the routine; alpha, beta and both buffers use type T
+ StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYRK_H_
+#endif
diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc
index 92dda9fb..74a82822 100644
--- a/src/routines/level3/xtrmm.cc
+++ b/src/routines/level3/xtrmm.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/level3/xtrmm.h"
+#include "routines/level3/xtrmm.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/level3/xtrmm.hpp b/src/routines/level3/xtrmm.hpp
new file mode 100644
index 00000000..bb435592
--- /dev/null
+++ b/src/routines/level3/xtrmm.hpp
@@ -0,0 +1,54 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmm routine. The implementation is based on first transforming the
+// upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM
+// routine. Therefore, this class inherits from the Xgemm class.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTRMM_H_
+#define CLBLAST_ROUTINES_XTRMM_H_
+
+#include "routines/level3/xgemm.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xtrmm routine class: derives from Xgemm<T>, whose GEMM routine it calls (see file header)
+template <typename T>
+class Xtrmm: public Xgemm<T> {
+ public:
+
+ // Uses methods and variables of the regular Xgemm routine (named here: dependent base class)
+ using Xgemm<T>::precision_;
+ using Xgemm<T>::routine_name_;
+ using Xgemm<T>::queue_;
+ using Xgemm<T>::context_;
+ using Xgemm<T>::device_;
+ using Xgemm<T>::db_;
+ using Xgemm<T>::DoGemm;
+
+ // Constructor; 'name' defaults to "TRMM"
+ Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM");
+
+ // Templated-precision implementation of the routine; alpha and both buffers use type T
+ StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTRMM_H_
+#endif
diff --git a/src/routines/levelx/xomatcopy.cc b/src/routines/levelx/xomatcopy.cc
index 6e4bddb2..e8593301 100644
--- a/src/routines/levelx/xomatcopy.cc
+++ b/src/routines/levelx/xomatcopy.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/routines/levelx/xomatcopy.h"
+#include "routines/levelx/xomatcopy.hpp"
#include <string>
#include <vector>
diff --git a/src/routines/levelx/xomatcopy.hpp b/src/routines/levelx/xomatcopy.hpp
new file mode 100644
index 00000000..0e580230
--- /dev/null
+++ b/src/routines/levelx/xomatcopy.hpp
@@ -0,0 +1,41 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xomatcopy routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XOMATCOPY_H_
+#define CLBLAST_ROUTINES_XOMATCOPY_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xomatcopy routine class: derives from Routine and is templated on the precision type T
+template <typename T>
+class Xomatcopy: public Routine {
+ public:
+
+ // Constructor; 'name' defaults to "OMATCOPY"
+ Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY");
+
+ // Templated-precision implementation of the routine; alpha and both buffers use type T
+ StatusCode DoOmatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XOMATCOPY_H_
+#endif
diff --git a/src/tuning/copy_fast.cc b/src/tuning/kernels/copy_fast.cc
index 09fdbaba..34269bc7 100644
--- a/src/tuning/copy_fast.cc
+++ b/src/tuning/kernels/copy_fast.cc
@@ -14,8 +14,8 @@
#include <string>
#include <vector>
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/tuning/copy_pad.cc b/src/tuning/kernels/copy_pad.cc
index 7088b3bf..1e0dccd3 100644
--- a/src/tuning/copy_pad.cc
+++ b/src/tuning/kernels/copy_pad.cc
@@ -14,8 +14,8 @@
#include <string>
#include <vector>
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/tuning/transpose_fast.cc b/src/tuning/kernels/transpose_fast.cc
index 3b0bdeb5..7ac19cb6 100644
--- a/src/tuning/transpose_fast.cc
+++ b/src/tuning/kernels/transpose_fast.cc
@@ -14,8 +14,8 @@
#include <string>
#include <vector>
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/tuning/transpose_pad.cc b/src/tuning/kernels/transpose_pad.cc
index b9ab3ffa..63274415 100644
--- a/src/tuning/transpose_pad.cc
+++ b/src/tuning/kernels/transpose_pad.cc
@@ -14,8 +14,8 @@
#include <string>
#include <vector>
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/tuning/xaxpy.cc b/src/tuning/kernels/xaxpy.cc
index d27cb73d..88d12c1f 100644
--- a/src/tuning/xaxpy.cc
+++ b/src/tuning/kernels/xaxpy.cc
@@ -14,8 +14,8 @@
#include <string>
#include <vector>
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/tuning/xdot.cc b/src/tuning/kernels/xdot.cc
index 5f30296c..1581e13f 100644
--- a/src/tuning/xdot.cc
+++ b/src/tuning/kernels/xdot.cc
@@ -15,8 +15,8 @@
#include <string>
#include <vector>
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/tuning/xgemm.cc b/src/tuning/kernels/xgemm.cc
index d309b830..4b1efdef 100644
--- a/src/tuning/xgemm.cc
+++ b/src/tuning/kernels/xgemm.cc
@@ -14,8 +14,8 @@
#include <string>
#include <vector>
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/tuning/xgemv.cc b/src/tuning/kernels/xgemv.cc
index 6587dcf4..d42155ae 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/kernels/xgemv.cc
@@ -17,8 +17,8 @@
#include <string>
#include <vector>
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/tuning/xger.cc b/src/tuning/kernels/xger.cc
index 4be80c86..d2590c53 100644
--- a/src/tuning/xger.cc
+++ b/src/tuning/kernels/xger.cc
@@ -14,8 +14,8 @@
#include <string>
#include <vector>
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
new file mode 100644
index 00000000..19df5f9a
--- /dev/null
+++ b/src/tuning/tuning.hpp
@@ -0,0 +1,161 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the interface to the CLTune auto-tuner. This is only used for the optional
+// and stand-alone tuner binaries and not part of the core of CLBlast.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TUNING_H_
+#define CLBLAST_TUNING_H_
+
+#include <vector>
+#include <string>
+
+#include <cltune.h>
+
+#include "utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Parses the command-line arguments, sets up the input buffers, configures the CLTune tuner, runs
+// it and reports the results. Used for all kernel families: 'C' is the kernel-specific settings
+// class and 'T' the data-type. Header-only, so that it is instantiated for each kernel's 'C'.
+template <typename C, typename T>
+void Tuner(int argc, char* argv[]) {
+
+ // Parses the platform/device/precision and the kernel-specific options from the command-line
+ auto help = std::string{"* Options given/available:\n"};
+ auto args = Arguments<T>{};
+ args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+ args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+ args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+ for (auto &o: C::GetOptions()) {
+ if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, C::DefaultM()); }
+ if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, C::DefaultN()); }
+ if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, C::DefaultK()); }
+ if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); }
+ if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
+ if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); }
+ }
+ fprintf(stdout, "%s\n", help.c_str());
+
+ // Lets the kernel-specific class 'C' validate the given arguments
+ C::TestValidArguments(args);
+
+ // Returns early if the device does not support 'T'; otherwise retrieves its vendor/type flags
+ auto isAMD = false;
+ auto isARM = false;
+ auto isGPU = false;
+ {
+ const auto platform = Platform(args.platform_id);
+ const auto device = Device(platform, args.device_id);
+ if (!PrecisionSupported<T>(device)) {
+ printf("* Unsupported precision, skipping this tuning run\n\n");
+ return;
+ }
+ isAMD = device.IsAMD();
+ isARM = device.IsARM();
+ isGPU = device.IsGPU();
+ }
+
+ // Creates host-side input vectors (sized by 'C') and fills them with random data
+ auto x_vec = std::vector<T>(C::GetSizeX(args));
+ auto y_vec = std::vector<T>(C::GetSizeY(args));
+ auto a_mat = std::vector<T>(C::GetSizeA(args));
+ auto b_mat = std::vector<T>(C::GetSizeB(args));
+ auto c_mat = std::vector<T>(C::GetSizeC(args));
+ auto temp = std::vector<T>(C::GetSizeTemp(args));
+ PopulateVector(x_vec);
+ PopulateVector(y_vec);
+ PopulateVector(a_mat);
+ PopulateVector(b_mat);
+ PopulateVector(c_mat);
+ PopulateVector(temp);
+
+ // Initializes the CLTune tuner for the chosen platform and device
+ cltune::Tuner tuner(args.platform_id, args.device_id);
+
+ // Uses full-search (all parameter combinations) when the fraction is exactly 1.0 or 0.0, and
+ // random-search otherwise, passing 1.0/fraction to CLTune. The fraction is a command-line argument.
+ if (args.fraction == 1.0 || args.fraction == 0.0) {
+ tuner.UseFullSearch();
+ }
+ else {
+ tuner.UseRandomSearch(1.0/args.fraction);
+ }
+
+ // Prepends device-specific defines (AMD GPU, ARM GPU) to the source. This mimics src/routine.cc.
+ auto defines = std::string{""};
+ if (isAMD && isGPU) {
+ defines += "#define USE_CL_MAD 1\n";
+ defines += "#define USE_STAGGERED_INDICES 1\n";
+ }
+ if (isARM && isGPU) {
+ defines += "#define GLOBAL_MEM_FENCE 1\n";
+ }
+
+ // Loads the kernel sources and registers both the kernel to tune and its reference
+ auto sources = defines + C::GetSources();
+ auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize());
+ tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef());
+
+ // Sets the tunable parameters, their constraints and the local memory size (all given by 'C')
+ C::SetParameters(tuner, id);
+ C::SetConstraints(tuner, id);
+ C::SetLocalMemorySize(tuner, id, args);
+
+ // Fixes the precision as a single-valued parameter, for both the kernel and the reference
+ tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+ tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+ // Modifies the thread-sizes (both global and local) based on the parameters
+ for (auto &parameters: C::MulLocal()) { tuner.MulLocalSize(id, parameters); }
+ for (auto &parameters: C::DivLocal()) { tuner.DivLocalSize(id, parameters); }
+ for (auto &parameters: C::MulGlobal()) { tuner.MulGlobalSize(id, parameters); }
+ for (auto &parameters: C::DivGlobal()) { tuner.DivGlobalSize(id, parameters); }
+
+ // Sets the kernel arguments through the kernel-specific class 'C'
+ C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);
+
+ // Starts the tuning process
+ tuner.Tune();
+
+ // Prints the results to screen and retrieves the time (in ms) of the best-case
+ auto time_ms = tuner.PrintToScreen();
+ tuner.PrintFormatted();
+
+ // Also prints the performance of the best-case in terms of GB/s or GFLOPS (unless time is zero)
+ if (time_ms != 0.0) {
+ printf("[ -------> ] %.1lf ms", time_ms);
+ printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str());
+ }
+
+ // Outputs the results as JSON to 'clblast_<family>_<precision>.json', including some meta-data
+ auto precision_string = std::to_string(static_cast<size_t>(args.precision));
+ auto metadata = std::vector<std::pair<std::string,std::string>>{
+ {"kernel_family", C::KernelFamily()},
+ {"precision", precision_string}
+ };
+ for (auto &o: C::GetOptions()) {
+ if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); }
+ if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); }
+ if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); }
+ if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); }
+ if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); }
+ }
+ tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata);
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TUNING_H_
+#endif
diff --git a/src/utilities.cc b/src/utilities.cc
index 30b09a5f..e3a1fb75 100644
--- a/src/utilities.cc
+++ b/src/utilities.cc
@@ -11,7 +11,7 @@
//
// =================================================================================================
-#include "internal/utilities.h"
+#include "utilities.hpp"
#include <string>
#include <vector>
diff --git a/src/utilities.hpp b/src/utilities.hpp
new file mode 100644
index 00000000..9a2b9ffc
--- /dev/null
+++ b/src/utilities.hpp
@@ -0,0 +1,257 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides declarations for the common (test) utility functions such as a command-line
+// argument parser. On top of this, it serves as the 'common' header, including the C++ OpenCL
+// wrapper. These utilities are not only used for CLBlast, but also included as part of the tuners,
+// the performance client and the correctness testers.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_UTILITIES_H_
+#define CLBLAST_UTILITIES_H_
+
+#include <string>
+#include <functional>
+#include <complex>
+
+#include "clblast.h"
+#include "clblast_half.h"
+#include "clpp11.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Shorthands for complex data-types
+using float2 = std::complex<float>;
+using double2 = std::complex<double>;
+
+// Khronos OpenCL extensions
+const std::string kKhronosHalfPrecision = "cl_khr_fp16";
+const std::string kKhronosDoublePrecision = "cl_khr_fp64";
+
+// Caught an unknown error
+constexpr auto kUnknownError = -999;
+
+// =================================================================================================
+
+// The routine-specific arguments in string form
+constexpr auto kArgM = "m";
+constexpr auto kArgN = "n";
+constexpr auto kArgK = "k";
+constexpr auto kArgKL = "kl";
+constexpr auto kArgKU = "ku";
+constexpr auto kArgLayout = "layout";
+constexpr auto kArgATransp = "transA";
+constexpr auto kArgBTransp = "transB";
+constexpr auto kArgSide = "side";
+constexpr auto kArgTriangle = "triangle";
+constexpr auto kArgDiagonal = "diagonal";
+constexpr auto kArgXInc = "incx";
+constexpr auto kArgYInc = "incy";
+constexpr auto kArgXOffset = "offx";
+constexpr auto kArgYOffset = "offy";
+constexpr auto kArgALeadDim = "lda";
+constexpr auto kArgBLeadDim = "ldb";
+constexpr auto kArgCLeadDim = "ldc";
+constexpr auto kArgAOffset = "offa";
+constexpr auto kArgBOffset = "offb";
+constexpr auto kArgCOffset = "offc";
+constexpr auto kArgAPOffset = "offap";
+constexpr auto kArgDotOffset = "offdot";
+constexpr auto kArgNrm2Offset = "offnrm2";
+constexpr auto kArgAsumOffset = "offasum";
+constexpr auto kArgImaxOffset = "offimax";
+constexpr auto kArgAlpha = "alpha";
+constexpr auto kArgBeta = "beta";
+
+// The tuner-specific arguments in string form
+constexpr auto kArgFraction = "fraction";
+
+// The client-specific arguments in string form
+constexpr auto kArgCompareclblas = "clblas";
+constexpr auto kArgComparecblas = "cblas";
+constexpr auto kArgStepSize = "step";
+constexpr auto kArgNumSteps = "num_steps";
+constexpr auto kArgNumRuns = "runs";
+
+// The test-specific arguments in string form
+constexpr auto kArgFullTest = "full_test";
+constexpr auto kArgVerbose = "verbose";
+
+// The common arguments in string form
+constexpr auto kArgPlatform = "platform";
+constexpr auto kArgDevice = "device";
+constexpr auto kArgPrecision = "precision";
+constexpr auto kArgHelp = "h";
+constexpr auto kArgQuiet = "q";
+constexpr auto kArgNoAbbreviations = "no_abbrv";
+
+// =================================================================================================
+
+// Returns a scalar with a default value
+template <typename T>
+T GetScalar();
+
+// Returns a scalar of value 1
+template <typename T>
+T ConstantOne();
+
+// =================================================================================================
+
+// Structure with all possible arguments (and their defaults) for the test clients and tuners
+template <typename T>
+struct Arguments {
+ // Routine-specific arguments
+ size_t m = 1;
+ size_t n = 1;
+ size_t k = 1;
+ size_t ku = 1;
+ size_t kl = 1;
+ Layout layout = Layout::kRowMajor;
+ Transpose a_transpose = Transpose::kNo;
+ Transpose b_transpose = Transpose::kNo;
+ Side side = Side::kLeft;
+ Triangle triangle = Triangle::kUpper;
+ Diagonal diagonal = Diagonal::kUnit;
+ size_t x_inc = 1;
+ size_t y_inc = 1;
+ size_t x_offset = 0;
+ size_t y_offset = 0;
+ size_t a_ld = 1;
+ size_t b_ld = 1;
+ size_t c_ld = 1;
+ size_t a_offset = 0;
+ size_t b_offset = 0;
+ size_t c_offset = 0;
+ size_t ap_offset = 0;
+ size_t dot_offset = 0;
+ size_t nrm2_offset = 0;
+ size_t asum_offset = 0;
+ size_t imax_offset = 0;
+ T alpha = ConstantOne<T>();
+ T beta = ConstantOne<T>();
+ size_t x_size = 1;
+ size_t y_size = 1;
+ size_t a_size = 1;
+ size_t b_size = 1;
+ size_t c_size = 1;
+ size_t ap_size = 1;
+ size_t scalar_size = 1;
+ // Tuner-specific arguments
+ double fraction = 1.0; // the tuner uses full-search when this is 1.0 or 0.0, else random-search
+ // Client-specific arguments
+ int compare_clblas = 1;
+ int compare_cblas = 1;
+ size_t step = 1;
+ size_t num_steps = 0;
+ size_t num_runs = 10;
+ // Common arguments
+ size_t platform_id = 0;
+ size_t device_id = 0;
+ Precision precision = Precision::kSingle;
+ bool print_help = false;
+ bool silent = false;
+ bool no_abbrv = false;
+};
+
+// Structure bundling all possible OpenCL buffers (of element type 'T') for the test clients
+template <typename T>
+struct Buffers {
+ Buffer<T> x_vec;
+ Buffer<T> y_vec;
+ Buffer<T> a_mat;
+ Buffer<T> b_mat;
+ Buffer<T> c_mat;
+ Buffer<T> ap_mat;
+ Buffer<T> scalar;
+};
+
+// =================================================================================================
+
+// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
+// data-types such as the Layout and Transpose data-types.
+template <typename T>
+std::string ToString(T value);
+
+// =================================================================================================
+
+// Helper for the function "GetArgument"
+template <typename T>
+T ConvertArgument(const char* value);
+
+// Basic argument parser, matching patterns in the form of "-option value" and "--option value"
+template <typename T>
+T GetArgument(const int argc, char *argv[], std::string &help,
+ const std::string &option, const T default_value);
+
+// Returns the precision only
+Precision GetPrecision(const int argc, char *argv[],
+ const Precision default_precision = Precision::kSingle);
+
+// As in "GetArgument", but now only checks whether an argument is given or not
+bool CheckArgument(const int argc, char *argv[], std::string &help, const std::string &option);
+
+// =================================================================================================
+
+// Helper function to check for errors in the status code: true for anything but kSuccess
+constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
+
+// =================================================================================================
+
+// Returns a random number to be used as a seed
+unsigned int GetRandomSeed();
+
+// Test/example data lower and upper limit
+constexpr auto kTestDataLowerLimit = -2.0;
+constexpr auto kTestDataUpperLimit = 2.0;
+
+// Populates a vector with random data
+template <typename T>
+void PopulateVector(std::vector<T> &vector);
+
+// =================================================================================================
+
+// Conversion between half and single-precision
+std::vector<float> HalfToFloatBuffer(const std::vector<half>& source);
+void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source);
+
+// As above, but now for OpenCL data-types instead of std::vectors
+Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw);
+void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw);
+
+// =================================================================================================
+
+// Rounding functions
+size_t CeilDiv(const size_t x, const size_t y);
+size_t Ceil(const size_t x, const size_t y);
+
+// Returns whether or not 'a' is a multiple of 'b'
+bool IsMultiple(const size_t a, const size_t b);
+
+// =================================================================================================
+
+// Convert the precision enum into bytes, e.g. a double takes up 8 bytes
+size_t GetBytes(const Precision precision);
+
+// Convert the template argument into a precision value
+template <typename T>
+Precision PrecisionValue();
+
+// =================================================================================================
+
+// Returns false if this precision is not supported by the device
+template <typename T>
+bool PrecisionSupported(const Device &device);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_UTILITIES_H_
+#endif