summaryrefslogtreecommitdiff
path: root/include/internal
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-06-18 20:20:13 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2016-06-18 20:20:13 +0200
commitf726fbdc9fef937fbe32222f0e66aac8d7e2678c (patch)
treecb62cc877ea239052fb1882f7bf327aace3e7776 /include/internal
parentbacb5d2bb2ea7b141034878090aca850db8f9d00 (diff)
Moved all headers into the source tree, changed headers to .hpp extension
Diffstat (limited to 'include/internal')
-rw-r--r--include/internal/buffer_test.h121
-rw-r--r--include/internal/cache.h98
-rw-r--r--include/internal/clpp11.h695
-rw-r--r--include/internal/database.h104
-rw-r--r--include/internal/database/copy.h262
-rw-r--r--include/internal/database/pad.h270
-rw-r--r--include/internal/database/padtranspose.h270
-rw-r--r--include/internal/database/transpose.h258
-rw-r--r--include/internal/database/xaxpy.h270
-rw-r--r--include/internal/database/xdot.h200
-rw-r--r--include/internal/database/xgemm.h263
-rw-r--r--include/internal/database/xgemv.h231
-rw-r--r--include/internal/database/xger.h220
-rw-r--r--include/internal/public_api.h34
-rw-r--r--include/internal/routine.h68
-rw-r--r--include/internal/routines/common.h173
-rw-r--r--include/internal/routines/level1/xamax.h40
-rw-r--r--include/internal/routines/level1/xasum.h40
-rw-r--r--include/internal/routines/level1/xaxpy.h40
-rw-r--r--include/internal/routines/level1/xcopy.h40
-rw-r--r--include/internal/routines/level1/xdot.h42
-rw-r--r--include/internal/routines/level1/xdotc.h44
-rw-r--r--include/internal/routines/level1/xdotu.h44
-rw-r--r--include/internal/routines/level1/xmax.h49
-rw-r--r--include/internal/routines/level1/xmin.h49
-rw-r--r--include/internal/routines/level1/xnrm2.h40
-rw-r--r--include/internal/routines/level1/xscal.h39
-rw-r--r--include/internal/routines/level1/xsum.h49
-rw-r--r--include/internal/routines/level1/xswap.h40
-rw-r--r--include/internal/routines/level2/xgbmv.h49
-rw-r--r--include/internal/routines/level2/xgemv.h56
-rw-r--r--include/internal/routines/level2/xger.h43
-rw-r--r--include/internal/routines/level2/xgerc.h46
-rw-r--r--include/internal/routines/level2/xgeru.h46
-rw-r--r--include/internal/routines/level2/xhbmv.h49
-rw-r--r--include/internal/routines/level2/xhemv.h49
-rw-r--r--include/internal/routines/level2/xher.h46
-rw-r--r--include/internal/routines/level2/xher2.h44
-rw-r--r--include/internal/routines/level2/xhpmv.h49
-rw-r--r--include/internal/routines/level2/xhpr.h45
-rw-r--r--include/internal/routines/level2/xhpr2.h46
-rw-r--r--include/internal/routines/level2/xsbmv.h49
-rw-r--r--include/internal/routines/level2/xspmv.h49
-rw-r--r--include/internal/routines/level2/xspr.h45
-rw-r--r--include/internal/routines/level2/xspr2.h46
-rw-r--r--include/internal/routines/level2/xsymv.h49
-rw-r--r--include/internal/routines/level2/xsyr.h45
-rw-r--r--include/internal/routines/level2/xsyr2.h46
-rw-r--r--include/internal/routines/level2/xtbmv.h49
-rw-r--r--include/internal/routines/level2/xtpmv.h49
-rw-r--r--include/internal/routines/level2/xtrmv.h49
-rw-r--r--include/internal/routines/level3/xgemm.h48
-rw-r--r--include/internal/routines/level3/xhemm.h54
-rw-r--r--include/internal/routines/level3/xher2k.h46
-rw-r--r--include/internal/routines/level3/xherk.h45
-rw-r--r--include/internal/routines/level3/xsymm.h56
-rw-r--r--include/internal/routines/level3/xsyr2k.h46
-rw-r--r--include/internal/routines/level3/xsyrk.h47
-rw-r--r--include/internal/routines/level3/xtrmm.h54
-rw-r--r--include/internal/routines/levelx/xomatcopy.h41
-rw-r--r--include/internal/tuning.h161
-rw-r--r--include/internal/utilities.h257
62 files changed, 0 insertions, 5992 deletions
diff --git a/include/internal/buffer_test.h b/include/internal/buffer_test.h
deleted file mode 100644
index 80f5243f..00000000
--- a/include/internal/buffer_test.h
+++ /dev/null
@@ -1,121 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are
-// templated and thus header-only.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_BUFFER_TEST_H_
-#define CLBLAST_BUFFER_TEST_H_
-
-#include "clblast.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Validates matrix 'A'. The leading dimension 'ld' must be at least 'one', and the buffer must hold
-// at least ld * (two - 1) + one + offset elements of type T. Any exception raised while computing
-// or querying the sizes is reported as an invalid matrix.
-template <typename T>
-StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
-                       const size_t offset, const size_t ld) {
-  if (one > ld) { return StatusCode::kInvalidLeadDimA; }
-  try {
-    const auto num_elements = offset + one + ld * (two - 1);
-    const auto required_bytes = num_elements * sizeof(T);
-    if (required_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryA; }
-  }
-  catch (...) {
-    return StatusCode::kInvalidMatrixA;
-  }
-  return StatusCode::kSuccess;
-}
-
-// Validates matrix 'B'. The leading dimension 'ld' must be at least 'one', and the buffer must hold
-// at least ld * (two - 1) + one + offset elements of type T. Any exception raised while computing
-// or querying the sizes is reported as an invalid matrix.
-template <typename T>
-StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
-                       const size_t offset, const size_t ld) {
-  if (one > ld) { return StatusCode::kInvalidLeadDimB; }
-  try {
-    const auto num_elements = offset + one + ld * (two - 1);
-    const auto required_bytes = num_elements * sizeof(T);
-    if (required_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryB; }
-  }
-  catch (...) {
-    return StatusCode::kInvalidMatrixB;
-  }
-  return StatusCode::kSuccess;
-}
-
-// Validates matrix 'C'. The leading dimension 'ld' must be at least 'one', and the buffer must hold
-// at least ld * (two - 1) + one + offset elements of type T. Any exception raised while computing
-// or querying the sizes is reported as an invalid matrix.
-template <typename T>
-StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
-                       const size_t offset, const size_t ld) {
-  if (one > ld) { return StatusCode::kInvalidLeadDimC; }
-  try {
-    const auto num_elements = offset + one + ld * (two - 1);
-    const auto required_bytes = num_elements * sizeof(T);
-    if (required_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryC; }
-  }
-  catch (...) {
-    return StatusCode::kInvalidMatrixC;
-  }
-  return StatusCode::kSuccess;
-}
-
-// Validates the packed matrix 'AP': the buffer must hold at least n * (n + 1) / 2 + offset elements
-// of type T. The status codes of matrix 'A' are used to report problems.
-template <typename T>
-StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
-  try {
-    const auto packed_elements = (n * (n + 1)) / 2;
-    const auto required_bytes = (packed_elements + offset) * sizeof(T);
-    if (required_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryA; }
-  }
-  catch (...) {
-    return StatusCode::kInvalidMatrixA;
-  }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Validates vector 'X'. The increment must be non-zero, and the buffer must hold at least
-// (n - 1) * inc + 1 + offset elements of type T. Any exception raised while computing or querying
-// the sizes is reported as an invalid vector.
-template <typename T>
-StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                       const size_t inc) {
-  if (inc == 0) { return StatusCode::kInvalidIncrementX; }
-  try {
-    const auto last_index = (n - 1) * inc;
-    const auto required_bytes = (last_index + 1 + offset) * sizeof(T);
-    if (required_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryX; }
-  }
-  catch (...) {
-    return StatusCode::kInvalidVectorX;
-  }
-  return StatusCode::kSuccess;
-}
-
-// Validates vector 'Y'. The increment must be non-zero, and the buffer must hold at least
-// (n - 1) * inc + 1 + offset elements of type T. Any exception raised while computing or querying
-// the sizes is reported as an invalid vector.
-template <typename T>
-StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                       const size_t inc) {
-  if (inc == 0) { return StatusCode::kInvalidIncrementY; }
-  try {
-    const auto last_index = (n - 1) * inc;
-    const auto required_bytes = (last_index + 1 + offset) * sizeof(T);
-    if (required_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryY; }
-  }
-  catch (...) {
-    return StatusCode::kInvalidVectorY;
-  }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Validates the 'scalar' vector: the buffer must hold at least n + offset elements of type T. Any
-// exception raised while querying the buffer size is reported as an invalid scalar vector.
-template <typename T>
-StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
-  try {
-    const auto required_bytes = (offset + n) * sizeof(T);
-    if (required_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryScalar; }
-  }
-  catch (...) {
-    return StatusCode::kInvalidVectorScalar;
-  }
-  return StatusCode::kSuccess;
-}
-
-// Validates the 'index' vector: the buffer must hold at least n + offset elements of type T. Note
-// that problems are reported with the status codes of the 'scalar' vector.
-template <typename T>
-StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
-  try {
-    const auto required_bytes = (offset + n) * sizeof(T);
-    if (required_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryScalar; }
-  }
-  catch (...) {
-    return StatusCode::kInvalidVectorScalar;
-  }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_BUFFER_TEST_H_
-#endif
diff --git a/include/internal/cache.h b/include/internal/cache.h
deleted file mode 100644
index bc7e87d9..00000000
--- a/include/internal/cache.h
+++ /dev/null
@@ -1,98 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the caching functionality of compiled binaries and programs.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_CACHE_H_
-#define CLBLAST_CACHE_H_
-
-#include <string>
-#include <vector>
-#include <mutex>
-
-#include "internal/utilities.h"
-
-namespace clblast {
-// =================================================================================================
-
-// A single entry in the cache of compiled OpenCL binaries, along with the meta-data that is used
-// to look it up again
-struct BinaryCache {
-  std::string binary;
-  std::string device_name;
-  Precision precision;
-  std::string routine_name_;
-
-  // Returns true when this entry was stored for the given device name, precision and routine name.
-  // This is a pure query, so it is const-qualified and can also be used on const entries.
-  bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
-                    const std::string &ref_routine) const {
-    return (device_name == ref_device &&
-            precision == ref_precision &&
-            routine_name_ == ref_routine);
-  }
-};
-
-// The actual cache, implemented as a vector of the above data-type, and its mutex.
-// NOTE(review): both objects are declared 'static' at namespace scope in this header, so every
-// translation unit that includes it gets its own private vector and mutex. Confirm that the cache
-// is only accessed from a single source file; otherwise entries will not be shared.
-static std::vector<BinaryCache> binary_cache_;
-static std::mutex binary_cache_mutex_;
-
-// =================================================================================================
-
-// A single entry in the cache of compiled OpenCL programs, along with the meta-data that is used
-// to look it up again
-struct ProgramCache {
-  Program program;
-  ContextPointer context_ptr;
-  Precision precision;
-  std::string routine_name_;
-
-  // Returns true when this entry was stored for the given context pointer, precision and routine
-  // name. This is a pure query, so it is const-qualified and can also be used on const entries.
-  bool MatchInCache(const ContextPointer ref_context, const Precision &ref_precision,
-                    const std::string &ref_routine) const {
-    return (context_ptr == ref_context &&
-            precision == ref_precision &&
-            routine_name_ == ref_routine);
-  }
-};
-
-// The actual cache, implemented as a vector of the above data-type, and its mutex.
-// NOTE(review): both objects are declared 'static' at namespace scope in this header, so every
-// translation unit that includes it gets its own private vector and mutex. Confirm that the cache
-// is only accessed from a single source file; otherwise entries will not be shared.
-static std::vector<ProgramCache> program_cache_;
-static std::mutex program_cache_mutex_;
-
-// =================================================================================================
-
-// Stores the compiled binary or program in the cache. The remaining arguments (device name or
-// context, precision and routine name) are the meta-data stored alongside it; they correspond to
-// the fields compared by the MatchInCache methods above.
-void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
-                        const Precision &precision, const std::string &routine_name);
-void StoreProgramToCache(const Program &program, const Context &context,
-                         const Precision &precision, const std::string &routine_name);
-
-// Queries the cache and retrieves a matching binary or program. Assumes that the match is
-// available, throws otherwise. The result is returned by const reference.
-// NOTE(review): presumably the reference points into the cache vector; verify its lifetime against
-// the implementation before holding on to it.
-const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
-                                      const std::string &routine_name);
-const Program& GetProgramFromCache(const Context &context, const Precision &precision,
-                                   const std::string &routine_name);
-
-// Queries the cache to see whether or not the compiled kernel is already there. These can be used
-// before calling the 'Get...FromCache' functions above, which throw when there is no match.
-bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
-                     const std::string &routine_name);
-bool ProgramIsInCache(const Context &context, const Precision &precision,
-                      const std::string &routine_name);
-
-// =================================================================================================
-
-// Clears the cache of stored binaries and reports the outcome as a status code.
-// NOTE(review): the name suggests that the program cache is cleared as well; confirm against the
-// implementation.
-StatusCode CacheClearAll();
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_CACHE_H_
-#endif
diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
deleted file mode 100644
index b834d8b4..00000000
--- a/include/internal/clpp11.h
+++ /dev/null
@@ -1,695 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API
-// calls. The main benefits are increased abstraction, automatic memory management, and portability.
-// Portability here means that a similar header exists for CUDA with the same classes and
-// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
-//
-// This file is taken from the Claduc project <https://github.com/CNugteren/Claduc> and therefore
-// contains the following header copyright notice:
-//
-// =================================================================================================
-//
-// Copyright 2015 SURFsara
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_CLPP11_H_
-#define CLBLAST_CLPP11_H_
-
-// C++
-#include <algorithm> // std::copy
-#include <string> // std::string
-#include <vector> // std::vector
-#include <memory> // std::shared_ptr
-#include <stdexcept> // std::runtime_error
-#include <numeric> // std::accumulate
-
-// OpenCL
-#if defined(__APPLE__) || defined(__MACOSX)
- #include <OpenCL/opencl.h>
-#else
- #include <CL/opencl.h>
-#endif
-
-namespace clblast {
-// =================================================================================================
-
-// Reports an error that was detected by the C++11 OpenCL header itself (this file): throws a
-// std::runtime_error carrying the given message
-inline void Error(const std::string &message) {
-  const auto what = std::string{"Internal OpenCL error: "} + message;
-  throw std::runtime_error(what);
-}
-
-// Reports an error that occurred in OpenCL: does nothing when the status is CL_SUCCESS and throws
-// a std::runtime_error carrying the numeric status code otherwise
-inline void CheckError(const cl_int status) {
-  if (status == CL_SUCCESS) { return; }
-  throw std::runtime_error("Internal OpenCL error: "+std::to_string(status));
-}
-
-// =================================================================================================
-
-// C++11 version of 'cl_event'. This is a thin wrapper: the event is not retained or released.
-class Event {
- public:
-
-  // Constructor based on the regular OpenCL data-type
-  explicit Event(const cl_event event): event_(event) { }
-
-  // Regular constructor: starts out without an event
-  explicit Event(): event_(nullptr) { }
-
-  // Waits for completion of this event, throws if OpenCL reports an error
-  void WaitForCompletion() const {
-    CheckError(clWaitForEvents(1, &event_));
-  }
-
-  // Retrieves the elapsed time of the last recorded event. The profiling counters are of type
-  // 'cl_ulong' and (according to the OpenCL specification) expressed in nanoseconds, so the result
-  // is in milliseconds. They must be read into 'cl_ulong' variables: a 'size_t' is too small on
-  // platforms where it is 32 bits wide. Note that no error checking is done on the
-  // 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
-  // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
-  float GetElapsedTime() const {
-    WaitForCompletion();
-    auto bytes = size_t{0};
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
-    auto time_start = cl_ulong{0};
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
-    auto time_end = cl_ulong{0};
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
-    return (time_end - time_start) * 1.0e-6f;
-  }
-
-  // Accessor to the private data-member
-  cl_event& operator()() { return event_; }
-  cl_event* pointer() { return &event_; }
- private:
-  cl_event event_;
-};
-
-// Pointer to an OpenCL event: a raw 'cl_event*', the same type as returned by Event::pointer()
-using EventPointer = cl_event*;
-
-// =================================================================================================
-
-// C++11 version of 'cl_platform_id'
-class Platform {
- public:
-
-  // Wraps an existing OpenCL platform
-  explicit Platform(const cl_platform_id platform): platform_(platform) { }
-
-  // Selects the platform with the given index out of all available platforms. Throws when there
-  // are no platforms at all or when the index is out of range.
-  explicit Platform(const size_t platform_id) {
-    auto platform_count = cl_uint{0};
-    CheckError(clGetPlatformIDs(0, nullptr, &platform_count));
-    if (platform_count == 0) {
-      Error("no platforms found");
-    }
-    auto all_platforms = std::vector<cl_platform_id>(platform_count);
-    CheckError(clGetPlatformIDs(platform_count, all_platforms.data(), nullptr));
-    if (platform_id >= platform_count) {
-      Error("invalid platform ID "+std::to_string(platform_id));
-    }
-    platform_ = all_platforms[platform_id];
-  }
-
-  // Counts the devices (of any type) on this platform
-  size_t NumDevices() const {
-    auto device_count = cl_uint{0};
-    CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &device_count));
-    return static_cast<size_t>(device_count);
-  }
-
-  // Gives access to the underlying OpenCL platform
-  const cl_platform_id& operator()() const { return platform_; }
- private:
-  cl_platform_id platform_;
-};
-
-// =================================================================================================
-
-// C++11 version of 'cl_device_id'
-class Device {
- public:
-
-  // Constructor based on the regular OpenCL data-type
-  explicit Device(const cl_device_id device): device_(device) { }
-
-  // Initialize the device. Note that this constructor can throw exceptions: when the platform has
-  // no devices, when the device ID is out of range, or when an OpenCL call fails.
-  explicit Device(const Platform &platform, const size_t device_id) {
-    auto num_devices = platform.NumDevices();
-    if (num_devices == 0) { Error("no devices found"); }
-    auto devices = std::vector<cl_device_id>(num_devices);
-    CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
-                              devices.data(), nullptr));
-    if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
-    device_ = devices[device_id];
-  }
-
-  // Methods to retrieve device information
-  std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); }
-  std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); }
-  std::string Name() const { return GetInfoString(CL_DEVICE_NAME); }
-  std::string Type() const {
-    auto type = GetInfo<cl_device_type>(CL_DEVICE_TYPE);
-    switch(type) {
-      case CL_DEVICE_TYPE_CPU: return "CPU";
-      case CL_DEVICE_TYPE_GPU: return "GPU";
-      case CL_DEVICE_TYPE_ACCELERATOR: return "accelerator";
-      default: return "default";
-    }
-  }
-  size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); }
-  size_t MaxWorkItemDimensions() const {
-    return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
-  }
-  std::vector<size_t> MaxWorkItemSizes() const {
-    return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
-  }
-  size_t LocalMemSize() const {
-    return static_cast<size_t>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
-  }
-  std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
-  size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); }
-  size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); }
-  // The two memory sizes below are reported by OpenCL as 'cl_ulong', so they must not be read
-  // through the 'cl_uint'-based GetInfo helper: that would overrun its 4-byte result variable
-  size_t MemorySize() const {
-    return static_cast<size_t>(GetInfo<cl_ulong>(CL_DEVICE_GLOBAL_MEM_SIZE));
-  }
-  size_t MaxAllocSize() const {
-    return static_cast<size_t>(GetInfo<cl_ulong>(CL_DEVICE_MAX_MEM_ALLOC_SIZE));
-  }
-  size_t MemoryClock() const { return 0; } // Not exposed in OpenCL
-  size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
-
-  // Configuration-validity checks
-  bool IsLocalMemoryValid(const size_t local_mem_usage) const {
-    return (local_mem_usage <= LocalMemSize());
-  }
-  // Checks a local work-group configuration against the device limits: the number of dimensions,
-  // the size in each dimension, and the total number of work-items. The number of dimensions is
-  // verified first such that the per-dimension limits are never indexed out of range.
-  bool IsThreadConfigValid(const std::vector<size_t> &local) const {
-    if (local.size() > MaxWorkItemDimensions()) { return false; }
-    const auto max_sizes = MaxWorkItemSizes(); // queried once instead of once per dimension
-    if (local.size() > max_sizes.size()) { return false; }
-    auto local_size = size_t{1};
-    for (auto i=size_t{0}; i<local.size(); ++i) {
-      if (local[i] > max_sizes[i]) { return false; }
-      local_size *= local[i];
-    }
-    return (local_size <= MaxWorkGroupSize());
-  }
-
-  // Query for a specific type of device or brand
-  bool IsCPU() const { return Type() == "CPU"; }
-  bool IsGPU() const { return Type() == "GPU"; }
-  bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
-  bool IsARM() const { return Vendor() == "ARM"; }
-
-  // Accessor to the private data-member
-  const cl_device_id& operator()() const { return device_; }
- private:
-  cl_device_id device_;
-
-  // Private helper functions
-
-  // Retrieves a single value of type T. The caller has to make sure that T matches the type that
-  // OpenCL uses for the requested property.
-  template <typename T>
-  T GetInfo(const cl_device_info info) const {
-    auto bytes = size_t{0};
-    CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
-    auto result = T(0);
-    CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
-    return result;
-  }
-  // Retrieves a single 'cl_uint' value and widens it: only to be used for 'cl_uint' properties
-  size_t GetInfo(const cl_device_info info) const {
-    auto bytes = size_t{0};
-    CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
-    auto result = cl_uint(0);
-    CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
-    return static_cast<size_t>(result);
-  }
-  // Retrieves an array of values of type T, sized according to what OpenCL reports
-  template <typename T>
-  std::vector<T> GetInfoVector(const cl_device_info info) const {
-    auto bytes = size_t{0};
-    CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
-    auto result = std::vector<T>(bytes/sizeof(T));
-    CheckError(clGetDeviceInfo(device_, info, bytes, result.data(), nullptr));
-    return result;
-  }
-  // Retrieves a string property
-  std::string GetInfoString(const cl_device_info info) const {
-    auto bytes = size_t{0};
-    CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
-    auto result = std::string{};
-    result.resize(bytes);
-    CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
-    return std::string{result.c_str()}; // Removes any trailing '\0'-characters
-  }
-};
-
-// =================================================================================================
-
-// C++11 version of 'cl_context'
-class Context {
- public:
-
-  // Wraps an existing OpenCL context: it is not released by this class, memory management is
-  // handled elsewhere
-  explicit Context(const cl_context context):
-      context_(new cl_context(context)) {
-  }
-
-  // Creates a new context for a single device. The context is released automatically once the
-  // last copy of this object is gone. NOTE(review): the deleter goes through CheckError, which
-  // throws when the release does not return CL_SUCCESS.
-  explicit Context(const Device &device):
-      context_(new cl_context, [](cl_context* raw) {
-        CheckError(clReleaseContext(*raw));
-        delete raw;
-      }) {
-    const cl_device_id device_id = device();
-    auto error_code = CL_SUCCESS;
-    *context_ = clCreateContext(nullptr, 1, &device_id, nullptr, nullptr, &error_code);
-    CheckError(error_code);
-  }
-
-  // Gives access to the underlying OpenCL context, by reference or by pointer
-  const cl_context& operator()() const { return *context_; }
-  cl_context* pointer() const { return context_.get(); }
- private:
-  std::shared_ptr<cl_context> context_;
-};
-
-// Pointer to an OpenCL context: a raw 'cl_context*', the same type as returned by
-// Context::pointer()
-using ContextPointer = cl_context*;
-
-// =================================================================================================
-
-// Enumeration of build statuses of the run-time compilation process, as returned by
-// Program::Build: kError for a build failure (CL_BUILD_PROGRAM_FAILURE), kInvalid for an invalid
-// binary (CL_INVALID_BINARY), and kSuccess otherwise
-enum class BuildStatus { kSuccess, kError, kInvalid };
-
-// C++11 version of 'cl_program'. Additionally holds the program's source code.
-class Program {
- public:
-  // Note that there is no constructor based on the regular OpenCL data-type because of extra state
-
-  // Source-based constructor with memory management: creates (but does not yet build) a program
-  // from OpenCL source code. Throws if OpenCL reports an error.
-  explicit Program(const Context &context, std::string source):
-      program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
-      length_(source.length()),
-      source_(std::move(source)),
-      source_ptr_(&source_[0]) {
-    auto status = CL_SUCCESS;
-    *program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
-    CheckError(status);
-  }
-
-  // Binary-based constructor with memory management: creates a program from a binary for a single
-  // device. Throws if OpenCL reports an error for the program or for the binary.
-  explicit Program(const Device &device, const Context &context, const std::string& binary):
-      program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
-      length_(binary.length()),
-      source_(binary),
-      source_ptr_(&source_[0]) {
-    auto status1 = CL_SUCCESS;
-    auto status2 = CL_SUCCESS;
-    const cl_device_id dev = device();
-    *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
-                                          reinterpret_cast<const unsigned char**>(&source_ptr_),
-                                          &status1, &status2);
-    CheckError(status1);
-    CheckError(status2);
-  }
-
-  // Compiles the device program and returns whether or not there were any warnings/errors. The
-  // options are concatenated as-is (without separators) after a single leading space. Throws for
-  // any OpenCL error other than a build failure or an invalid binary.
-  BuildStatus Build(const Device &device, std::vector<std::string> &options) {
-    auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
-    const cl_device_id dev = device();
-    auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr);
-    if (status == CL_BUILD_PROGRAM_FAILURE) {
-      return BuildStatus::kError;
-    }
-    else if (status == CL_INVALID_BINARY) {
-      return BuildStatus::kInvalid;
-    }
-    else {
-      CheckError(status);
-      return BuildStatus::kSuccess;
-    }
-  }
-
-  // Retrieves the warning/error message from the compiler (if any)
-  std::string GetBuildInfo(const Device &device) const {
-    auto bytes = size_t{0};
-    auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG};
-    CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes));
-    auto result = std::string{};
-    result.resize(bytes);
-    CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr));
-    return result;
-  }
-
-  // Retrieves a binary or an intermediate representation of the compiled program. Only a single
-  // size and a single binary are requested, i.e. this assumes a program for one device.
-  std::string GetIR() const {
-    auto bytes = size_t{0};
-    CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
-    auto result = std::string{};
-    result.resize(bytes);
-    // OpenCL writes the binary into this buffer, so a pointer to non-const data is needed here:
-    // the pointer returned by 'data()' is const and must not be written through
-    auto result_ptr = &result[0];
-    CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
-    return result;
-  }
-
-  // Accessor to the private data-member
-  const cl_program& operator()() const { return *program_; }
- private:
-  std::shared_ptr<cl_program> program_;
-  size_t length_;
-  std::string source_; // Note: the source can also be a binary or IR
-  const char* source_ptr_; // Points into 'source_', used while creating the program
-};
-
-// =================================================================================================
-
-// C++11 version of 'cl_command_queue'
-class Queue {
- public:
-
-  // Wraps an existing OpenCL command queue: it is not released by this class, memory management
-  // is handled elsewhere
-  explicit Queue(const cl_command_queue queue):
-      queue_(new cl_command_queue(queue)) {
-  }
-
-  // Creates a new command queue with profiling enabled. The queue is released automatically once
-  // the last copy of this object is gone.
-  explicit Queue(const Context &context, const Device &device):
-      queue_(new cl_command_queue, [](cl_command_queue* raw) {
-        CheckError(clReleaseCommandQueue(*raw));
-        delete raw;
-      }) {
-    auto error_code = CL_SUCCESS;
-    #ifdef CL_VERSION_2_0
-      cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
-      *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &error_code);
-    #else
-      *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &error_code);
-    #endif
-    CheckError(error_code);
-  }
-
-  // Blocks until all work in the queue has finished. The event argument of the first overload is
-  // not used.
-  void Finish(Event &) const {
-    CheckError(clFinish(*queue_));
-  }
-  void Finish() const {
-    CheckError(clFinish(*queue_));
-  }
-
-  // Looks up the context this queue belongs to. The returned object does not own the context.
-  Context GetContext() const {
-    auto info_size = size_t{0};
-    CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, 0, nullptr, &info_size));
-    cl_context raw_context;
-    CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, info_size, &raw_context, nullptr));
-    return Context(raw_context);
-  }
-
-  // Looks up the device this queue belongs to
-  Device GetDevice() const {
-    auto info_size = size_t{0};
-    CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, 0, nullptr, &info_size));
-    cl_device_id raw_device;
-    CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, info_size, &raw_device, nullptr));
-    return Device(raw_device);
-  }
-
-  // Gives access to the underlying OpenCL command queue
-  const cl_command_queue& operator()() const { return *queue_; }
- private:
-  std::shared_ptr<cl_command_queue> queue_;
-};
-
-// =================================================================================================
-
-// C++11 version of host memory: a std::vector that is shared between all copies of this object
-template <typename T>
-class BufferHost {
- public:
-
-  // Regular constructor with memory management: allocates 'size' elements of type T. The context
-  // argument is not used here.
-  explicit BufferHost(const Context &, const size_t size):
-      buffer_(new std::vector<T>(size)) {
-  }
-
-  // Retrieves the actual allocated size in bytes
-  size_t GetSize() const {
-    return buffer_->size()*sizeof(T);
-  }
-
-  // Compatibility with std::vector. Note that 'end' points one past the last element, such that
-  // 'end() - begin()' equals 'size()'. Both are also valid (and equal) for an empty buffer.
-  size_t size() const { return buffer_->size(); }
-  T* begin() { return buffer_->data(); }
-  T* end() { return buffer_->data() + buffer_->size(); }
-  T& operator[](const size_t i) { return (*buffer_)[i]; }
-  T* data() { return buffer_->data(); }
-  const T* data() const { return buffer_->data(); }
-
- private:
-  std::shared_ptr<std::vector<T>> buffer_;
-};
-
-// =================================================================================================
-
-// Enumeration of buffer access types. The first three select the OpenCL memory flags used when a
-// buffer is created; kNotOwned marks a buffer that wraps an existing 'cl_mem' object, which is not
-// released when the wrapper goes away.
-enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };
-
-// C++11 version of 'cl_mem'
-template <typename T>
-class Buffer {
- public:
-
- // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
- explicit Buffer(const cl_mem buffer):
- buffer_(new cl_mem),
- access_(BufferAccess::kNotOwned) {
- *buffer_ = buffer;
- }
-
- // Regular constructor with memory management. If this class does not own the buffer object, then
- // the memory will not be freed automatically afterwards.
- explicit Buffer(const Context &context, const BufferAccess access, const size_t size):
- buffer_(new cl_mem, [access](cl_mem* m) {
- if (access != BufferAccess::kNotOwned) { CheckError(clReleaseMemObject(*m)); }
- delete m;
- }),
- access_(access) {
- auto flags = cl_mem_flags{CL_MEM_READ_WRITE};
- if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; }
- if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; }
- auto status = CL_SUCCESS;
- *buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status);
- CheckError(status);
- }
-
- // As above, but now with read/write access as a default
- explicit Buffer(const Context &context, const size_t size):
- Buffer<T>(context, BufferAccess::kReadWrite, size) {
- }
-
- // Constructs a new buffer based on an existing host-container
- template <typename Iterator>
- explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
- Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
- auto size = static_cast<size_t>(end - start);
- auto pointer = &*start;
- CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer, 0,
- nullptr, nullptr));
- queue.Finish();
- }
-
- // Copies from device to host: reading the device buffer a-synchronously
- void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
- if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
- CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
- host, 0, nullptr, nullptr));
- }
- void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
- const size_t offset = 0) const {
- if (host.size() < size) { Error("target host buffer is too small"); }
- ReadAsync(queue, size, host.data(), offset);
- }
- void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
- const size_t offset = 0) const {
- if (host.size() < size) { Error("target host buffer is too small"); }
- ReadAsync(queue, size, host.data(), offset);
- }
-
- // Copies from device to host: reading the device buffer
- void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
- ReadAsync(queue, size, host, offset);
- queue.Finish();
- }
- void Read(const Queue &queue, const size_t size, std::vector<T> &host,
- const size_t offset = 0) const {
- Read(queue, size, host.data(), offset);
- }
- void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
- const size_t offset = 0) const {
- Read(queue, size, host.data(), offset);
- }
-
- // Copies from host to device: writing the device buffer a-synchronously
- void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
- if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
- if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
- CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
- host, 0, nullptr, nullptr));
- }
- void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
- const size_t offset = 0) {
- WriteAsync(queue, size, host.data(), offset);
- }
- void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
- const size_t offset = 0) {
- WriteAsync(queue, size, host.data(), offset);
- }
-
- // Copies from host to device: writing the device buffer
- void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
- WriteAsync(queue, size, host, offset);
- queue.Finish();
- }
- void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
- const size_t offset = 0) {
- Write(queue, size, host.data(), offset);
- }
- void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
- const size_t offset = 0) {
- Write(queue, size, host.data(), offset);
- }
-
- // Copies the contents of this buffer into another device buffer
- void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
- CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0,
- nullptr, nullptr));
- }
- void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
- CopyToAsync(queue, size, destination);
- queue.Finish();
- }
-
- // Retrieves the actual allocated size in bytes
- size_t GetSize() const {
- auto bytes = size_t{0};
- CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes));
- auto result = size_t{0};
- CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
- return result;
- }
-
- // Accessor to the private data-member
- const cl_mem& operator()() const { return *buffer_; }
- private:
- std::shared_ptr<cl_mem> buffer_;
- const BufferAccess access_;
-};
-
-// =================================================================================================
-
-// C++11 version of 'cl_kernel': keeps the OpenCL kernel object in a shared pointer, such that
-// copies of a Kernel refer to the same 'cl_kernel' handle.
-class Kernel {
- public:
-
-  // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere, so
-  // only the holder of the handle is freed and never the OpenCL kernel itself.
-  explicit Kernel(const cl_kernel kernel):
-      kernel_(new cl_kernel) {
-    *kernel_ = kernel;
-  }
-
-  // Regular constructor with memory management: creates the kernel called 'name' from 'program'
-  // and releases it again when the last copy of this object is destroyed.
-  explicit Kernel(const Program &program, const std::string &name):
-      kernel_(new cl_kernel, [](cl_kernel* k) { CheckError(clReleaseKernel(*k)); delete k; }) {
-    auto status = CL_SUCCESS;
-    *kernel_ = clCreateKernel(program(), name.c_str(), &status);
-    CheckError(status);
-  }
-
-  // Sets a kernel argument at the indicated position. The value is passed on as 'sizeof(T)' bytes;
-  // for a Buffer, the wrapped 'cl_mem' handle is passed instead of the Buffer object itself.
-  template <typename T>
-  void SetArgument(const size_t index, const T &value) {
-    CheckError(clSetKernelArg(*kernel_, static_cast<cl_uint>(index), sizeof(T), &value));
-  }
-  template <typename T>
-  void SetArgument(const size_t index, Buffer<T> &value) {
-    SetArgument(index, value());
-  }
-
-  // Sets all arguments in one go using parameter packs: the arguments are assigned to the
-  // positions 0, 1, 2, ... in the order given. Note that this overwrites previously set arguments
-  // using 'SetArgument' or 'SetArguments'.
-  template <typename... Args>
-  void SetArguments(Args&... args) {
-    SetArgumentsRecursive(0, args...);
-  }
-
-  // Retrieves the amount of local memory used per work-group for this kernel. OpenCL reports this
-  // value as a 'cl_ulong', which can be wider than 'size_t': the query therefore writes into a
-  // 'cl_ulong' of known size and converts afterwards, instead of writing into a 'size_t'.
-  size_t LocalMemUsage(const Device &device) const {
-    const auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
-    auto result = cl_ulong{0};
-    CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, sizeof(result), &result,
-                                        nullptr));
-    return static_cast<size_t>(result);
-  }
-
-  // Launches a kernel onto the specified queue. The number of dimensions is taken from the size
-  // of 'global'; 'event' is passed on to OpenCL as the event of this launch.
-  void Launch(const Queue &queue, const std::vector<size_t> &global,
-              const std::vector<size_t> &local, EventPointer event) {
-    CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
-                                      nullptr, global.data(), local.data(),
-                                      0, nullptr, event));
-  }
-
-  // As above, but with an event waiting list
-  void Launch(const Queue &queue, const std::vector<size_t> &global,
-              const std::vector<size_t> &local, EventPointer event,
-              std::vector<Event>& waitForEvents) {
-
-    // Nothing to wait for: uses the regular launch, which passes no waiting list at all
-    if (waitForEvents.empty()) {
-      Launch(queue, global, local, event);
-      return;
-    }
-
-    // Builds a plain version of the events waiting list
-    auto waitForEventsPlain = std::vector<cl_event>();
-    waitForEventsPlain.reserve(waitForEvents.size());
-    for (auto &waitEvent : waitForEvents) {
-      waitForEventsPlain.push_back(waitEvent());
-    }
-
-    // Launches the kernel while waiting for other events
-    CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
-                                      nullptr, global.data(), local.data(),
-                                      static_cast<cl_uint>(waitForEventsPlain.size()),
-                                      waitForEventsPlain.data(),
-                                      event));
-  }
-
-  // As above, but with the default local workgroup size (no local size is passed to OpenCL)
-  void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
-    CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
-                                      nullptr, global.data(), nullptr,
-                                      0, nullptr, event));
-  }
-
-  // Accessor to the private data-member
-  const cl_kernel& operator()() const { return *kernel_; }
- private:
-  std::shared_ptr<cl_kernel> kernel_;
-
-  // Internal implementation for the recursive SetArguments function: sets the argument at 'index'
-  // and continues with the remaining arguments at 'index+1'. The single-argument version ends the
-  // recursion.
-  template <typename T>
-  void SetArgumentsRecursive(const size_t index, T &first) {
-    SetArgument(index, first);
-  }
-  template <typename T, typename... Args>
-  void SetArgumentsRecursive(const size_t index, T &first, Args&... args) {
-    SetArgument(index, first);
-    SetArgumentsRecursive(index+1, args...);
-  }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_CLPP11_H_
-#endif
diff --git a/include/internal/database.h b/include/internal/database.h
deleted file mode 100644
index f93eaa22..00000000
--- a/include/internal/database.h
+++ /dev/null
@@ -1,104 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Database class, providing a static variable holding the actual database
-// information. The class also provides utility functions to search the database and to access a
-// found entry by parameter-key. The database itself is filled in the corresponding source-file and
-// partially also by the database/xxxxx.h files, in which kernel-specific parameters are found.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_DATABASE_H_
-#define CLBLAST_DATABASE_H_
-
-#include <string>
-#include <vector>
-#include <unordered_map>
-
-#include "internal/utilities.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-class Database {
- public:
-
-  // Type alias for the database parameters: maps a parameter name to its value
-  using Parameters = std::unordered_map<std::string,size_t>;
-
-  // Structures for content inside the database. They are nested as follows: an entry (one kernel
-  // at one precision) holds vendors, a vendor (combined with a device type) holds devices, and a
-  // device holds the actual parameters.
-  struct DatabaseDevice {
-    const std::string name;
-    const Parameters parameters;
-  };
-  struct DatabaseVendor {
-    const std::string type;
-    const std::string name;
-    const std::vector<DatabaseDevice> devices;
-  };
-  struct DatabaseEntry {
-    const std::string kernel;
-    const Precision precision;
-    const std::vector<DatabaseVendor> vendors;
-  };
-
-  // The OpenCL device types
-  static constexpr auto kDeviceTypeCPU = "CPU";
-  static constexpr auto kDeviceTypeGPU = "GPU";
-  static constexpr auto kDeviceTypeAccelerator = "accelerator";
-  static constexpr auto kDeviceTypeAll = "default";
-
-  // The OpenCL device vendors
-  static constexpr auto kDeviceVendorAll = "default";
-
-  // Alternative names for some OpenCL vendors: maps a longer vendor string to the short name
-  const std::unordered_map<std::string,std::string> kVendorNames {
-    {"Intel(R) Corporation", "Intel"},
-    {"GenuineIntel", "Intel"},
-    {"Advanced Micro Devices, Inc.", "AMD"},
-    {"NVIDIA Corporation", "NVIDIA"},
-  };
-
-  // The database consists of separate database entries, stored together in a vector
-  static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
-  static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
-  static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
-  static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
-  static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
-  static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
-  static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
-  static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
-  static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
-  static const std::vector<DatabaseEntry> database;
-
-  // The constructor, defined in the corresponding source-file.
-  // NOTE(review): presumably collects the parameters of the given routines for the device behind
-  // 'queue' at the given precision; confirm against the implementation.
-  explicit Database(const Queue &queue, const std::vector<std::string> &routines,
-                    const Precision precision);
-
-  // Accessor of values by key. An unknown key throws 'std::out_of_range' (through 'at') rather
-  // than dereferencing the end-iterator of the map, which is undefined behaviour.
-  size_t operator[](const std::string &key) const { return parameters_.at(key); }
-
-  // Obtain a list of OpenCL pre-processor defines based on the parameters
-  std::string GetDefines() const;
-
- private:
-  // Looks up the parameters for a kernel/device-type/vendor/device/precision combination; defined
-  // in the corresponding source-file
-  Parameters Search(const std::string &this_kernel, const std::string &this_type,
-                    const std::string &this_vendor, const std::string &this_device,
-                    const Precision this_precision) const;
-
-  // Found parameters suitable for this device/kernel
-  Parameters parameters_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_DATABASE_H_
-#endif
diff --git a/include/internal/database/copy.h b/include/internal/database/copy.h
deleted file mode 100644
index 201e8b8a..00000000
--- a/include/internal/database/copy.h
+++ /dev/null
@@ -1,262 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Database generator <database.py>
-//
-// This file populates the database with best-found tuning parameters for the 'Copy' kernels.
-//
-// =================================================================================================
-
-namespace clblast {
-// =================================================================================================
-
-// Database entry with the tuning parameters (COPY_DIMX, COPY_DIMY, COPY_VW, COPY_WPT) of the
-// 'Copy' kernel in half precision, grouped by device type and vendor and then listed per device
-// name. The file header attributes this table to the database generator (database.py).
-// NOTE(review): the "default" rows are presumably fall-backs for devices without a row of their
-// own; confirm against Database::Search.
-const Database::DatabaseEntry Database::CopyHalf = {
- "Copy", Precision::kHalf, {
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-// Database entry with the tuning parameters (COPY_DIMX, COPY_DIMY, COPY_VW, COPY_WPT) of the
-// 'Copy' kernel in single precision, grouped by device type and vendor and then listed per device
-// name. The file header attributes this table to the database generator (database.py).
-// NOTE(review): the "default" rows are presumably fall-backs for devices without a row of their
-// own; confirm against Database::Search.
-const Database::DatabaseEntry Database::CopySingle = {
- "Copy", Precision::kSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
- { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
- { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
- { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
- { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
- { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
- { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
- { "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
- { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
- { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-// Database entry with the tuning parameters (COPY_DIMX, COPY_DIMY, COPY_VW, COPY_WPT) of the
-// 'Copy' kernel in complex single precision, grouped by device type and vendor and then listed
-// per device name. The file header attributes this table to the database generator (database.py).
-// NOTE(review): the "default" rows are presumably fall-backs for devices without a row of their
-// own; confirm against Database::Search.
-const Database::DatabaseEntry Database::CopyComplexSingle = {
- "Copy", Precision::kComplexSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
- { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
- { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
- { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
- { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
- { "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-// Database entry with the tuning parameters (COPY_DIMX, COPY_DIMY, COPY_VW, COPY_WPT) of the
-// 'Copy' kernel in double precision, grouped by device type and vendor and then listed per device
-// name. The file header attributes this table to the database generator (database.py).
-// NOTE(review): the "default" rows are presumably fall-backs for devices without a row of their
-// own; confirm against Database::Search.
-const Database::DatabaseEntry Database::CopyDouble = {
- "Copy", Precision::kDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
- { "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } },
- { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- { "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
- { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-// Database entry with the tuning parameters (COPY_DIMX, COPY_DIMY, COPY_VW, COPY_WPT) of the
-// 'Copy' kernel in complex double precision, grouped by device type and vendor and then listed
-// per device name. The file header attributes this table to the database generator (database.py).
-// NOTE(review): the "default" rows are presumably fall-backs for devices without a row of their
-// own; confirm against Database::Search.
-const Database::DatabaseEntry Database::CopyComplexDouble = {
- "Copy", Precision::kComplexDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
- { "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
- { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-} // namespace clblast
diff --git a/include/internal/database/pad.h b/include/internal/database/pad.h
deleted file mode 100644
index cc703dd6..00000000
--- a/include/internal/database/pad.h
+++ /dev/null
@@ -1,270 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Database generator <database.py>
-//
-// This file populates the database with best-found tuning parameters for the 'Pad' kernels.
-//
-// =================================================================================================
-
-namespace clblast {
-// =================================================================================================
-
-const Database::DatabaseEntry Database::PadHalf = {
- "Pad", Precision::kHalf, {
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::PadSingle = {
- "Pad", Precision::kSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
- { "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
- { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::PadComplexSingle = {
- "Pad", Precision::kComplexSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
- { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
- { "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::PadDouble = {
- "Pad", Precision::kDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::PadComplexDouble = {
- "Pad", Precision::kComplexDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
- { "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-} // namespace clblast
diff --git a/include/internal/database/padtranspose.h b/include/internal/database/padtranspose.h
deleted file mode 100644
index f3b1f262..00000000
--- a/include/internal/database/padtranspose.h
+++ /dev/null
@@ -1,270 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Database generator <database.py>
-//
-// This file populates the database with best-found tuning parameters for the 'Padtranspose' kernels.
-//
-// =================================================================================================
-
-namespace clblast {
-// =================================================================================================
-
-const Database::DatabaseEntry Database::PadtransposeHalf = {
- "Padtranspose", Precision::kHalf, {
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::PadtransposeSingle = {
- "Padtranspose", Precision::kSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
- { "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
- { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
- { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
- { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
- { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
- { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
- "Padtranspose", Precision::kComplexSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
- { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
- { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
- { "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
- { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::PadtransposeDouble = {
- "Padtranspose", Precision::kDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
- { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
- { "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
- { "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
- { "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
- { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
- "Padtranspose", Precision::kComplexDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
- { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
- { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
- { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
- { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
- { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
- { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-} // namespace clblast
diff --git a/include/internal/database/transpose.h b/include/internal/database/transpose.h
deleted file mode 100644
index 0c893dae..00000000
--- a/include/internal/database/transpose.h
+++ /dev/null
@@ -1,258 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Database generator <database.py>
-//
-// This file populates the database with best-found tuning parameters for the 'Transpose' kernels.
-//
-// =================================================================================================
-
-namespace clblast {
-// =================================================================================================
-
-const Database::DatabaseEntry Database::TransposeHalf = {
- "Transpose", Precision::kHalf, {
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::TransposeSingle = {
- "Transpose", Precision::kSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
- { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
- { "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
- { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
- { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
- { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
- { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- { "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- { "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- { "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::TransposeComplexSingle = {
- "Transpose", Precision::kComplexSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
- { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- { "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
- { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::TransposeDouble = {
- "Transpose", Precision::kDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
- { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
- { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "default", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
- { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
- { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
- { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::TransposeComplexDouble = {
- "Transpose", Precision::kComplexDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
- { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
- { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "GeForce GTX 980", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-} // namespace clblast
diff --git a/include/internal/database/xaxpy.h b/include/internal/database/xaxpy.h
deleted file mode 100644
index 6e6719e8..00000000
--- a/include/internal/database/xaxpy.h
+++ /dev/null
@@ -1,270 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Database generator <database.py>
-//
-// This file populates the database with best-found tuning parameters for the 'Xaxpy' kernels.
-//
-// =================================================================================================
-
-namespace clblast {
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XaxpyHalf = {
- "Xaxpy", Precision::kHalf, {
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
- { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XaxpySingle = {
- "Xaxpy", Precision::kSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
- { "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
- { "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
- { "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } },
- { "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",1024}, {"WPT",2} } },
- { "default", { {"VW",2}, {"WGS",1024}, {"WPT",2} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
- { "GeForce GTX 480", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
- { "GeForce GTX 680", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
- { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
- { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
- { "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
- { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
- { "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XaxpyComplexSingle = {
- "Xaxpy", Precision::kComplexSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } },
- { "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
- { "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
- { "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
- { "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
- { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
- { "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
- { "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
- { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
- { "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
- { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
- { "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- { "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XaxpyDouble = {
- "Xaxpy", Precision::kDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
- { "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
- { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"VW",2}, {"WGS",128}, {"WPT",2} } },
- { "default", { {"VW",2}, {"WGS",128}, {"WPT",2} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
- { "default", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "GeForce GTX 480", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
- { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
- { "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
- { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
- { "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
- { "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XaxpyComplexDouble = {
- "Xaxpy", Precision::kComplexDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
- { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- { "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
- { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
- { "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
- { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
- { "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-} // namespace clblast
diff --git a/include/internal/database/xdot.h b/include/internal/database/xdot.h
deleted file mode 100644
index d09d8c62..00000000
--- a/include/internal/database/xdot.h
+++ /dev/null
@@ -1,200 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Database generator <database.py>
-//
-// This file populates the database with best-found tuning parameters for the 'Xdot' kernels.
-//
-// =================================================================================================
-
-namespace clblast {
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XdotHalf = {
- "Xdot", Precision::kHalf, {
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
- { "default", { {"WGS1",32}, {"WGS2",32} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",32}, {"WGS2",32} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XdotSingle = {
- "Xdot", Precision::kSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } },
- { "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
- { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
- { "Tahiti", { {"WGS1",128}, {"WGS2",32} } },
- { "default", { {"WGS1",128}, {"WGS2",32} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
- { "default", { {"WGS1",1024}, {"WGS2",32} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } },
- { "Iris Pro", { {"WGS1",512}, {"WGS2",64} } },
- { "default", { {"WGS1",64}, {"WGS2",32} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"WGS1",128}, {"WGS2",32} } },
- { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
- { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } },
- { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
- { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
- { "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
- { "default", { {"WGS1",128}, {"WGS2",32} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",64}, {"WGS2",32} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XdotComplexSingle = {
- "Xdot", Precision::kComplexSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
- { "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
- { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
- { "Tahiti", { {"WGS1",64}, {"WGS2",32} } },
- { "default", { {"WGS1",64}, {"WGS2",32} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
- { "default", { {"WGS1",1024}, {"WGS2",32} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
- { "Iris Pro", { {"WGS1",32}, {"WGS2",32} } },
- { "default", { {"WGS1",32}, {"WGS2",32} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"WGS1",64}, {"WGS2",32} } },
- { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
- { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
- { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
- { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
- { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
- { "default", { {"WGS1",64}, {"WGS2",32} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",32}, {"WGS2",32} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XdotDouble = {
- "Xdot", Precision::kDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } },
- { "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
- { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
- { "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
- { "default", { {"WGS1",64}, {"WGS2",32} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",64} } },
- { "default", { {"WGS1",512}, {"WGS2",64} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"WGS1",128}, {"WGS2",32} } },
- { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
- { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
- { "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
- { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
- { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
- { "default", { {"WGS1",128}, {"WGS2",32} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",64}, {"WGS2",32} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XdotComplexDouble = {
- "Xdot", Precision::kComplexDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
- { "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
- { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
- { "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
- { "default", { {"WGS1",64}, {"WGS2",32} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
- { "default", { {"WGS1",1024}, {"WGS2",32} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"WGS1",64}, {"WGS2",32} } },
- { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
- { "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } },
- { "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
- { "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
- { "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
- { "default", { {"WGS1",64}, {"WGS2",32} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",64}, {"WGS2",32} } },
- }
- },
- }
-};
-
-// =================================================================================================
-} // namespace clblast
diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h
deleted file mode 100644
index f35d2c88..00000000
--- a/include/internal/database/xgemm.h
+++ /dev/null
@@ -1,263 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Database generator <database.py>
-//
-// This file populates the database with best-found tuning parameters for the 'Xgemm' kernels.
-//
-// =================================================================================================
-
-namespace clblast {
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgemmHalf = {
- "Xgemm", Precision::kHalf, {
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgemmSingle = {
- "Xgemm", Precision::kSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
- { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
- { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
- { "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
- { "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
- { "default", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
- { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
- { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
- { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
- { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
- { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
- { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
- { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
- { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgemmComplexSingle = {
- "Xgemm", Precision::kComplexSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
- { "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
- { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
- { "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
- { "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
- { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
- { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
- { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- { "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgemmDouble = {
- "Xgemm", Precision::kDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
- { "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- { "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
- { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
- { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
- { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
- { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
- { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgemmComplexDouble = {
- "Xgemm", Precision::kComplexDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
- { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } },
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
- { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
- { "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-} // namespace clblast
diff --git a/include/internal/database/xgemv.h b/include/internal/database/xgemv.h
deleted file mode 100644
index 6b76c8ac..00000000
--- a/include/internal/database/xgemv.h
+++ /dev/null
@@ -1,231 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Database generator <database.py>
-//
-// This file populates the database with best-found tuning parameters for the 'Xgemv' kernels.
-//
-// =================================================================================================
-
-namespace clblast {
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgemvHalf = {
- "Xgemv", Precision::kHalf, {
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgemvSingle = {
- "Xgemv", Precision::kSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",4} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
- { "Iris", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } },
- { "Iris Pro", { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
- { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
- { "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
- { "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } },
- { "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
- { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
- { "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
- { "Tesla K20m", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
- { "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgemvComplexSingle = {
- "Xgemv", Precision::kComplexSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
- { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "Pitcairn", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
- { "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
- { "Iris", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "Iris Pro", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
- { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgemvDouble = {
- "Xgemv", Precision::kDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
- { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
- { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",2} } },
- { "default", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
- { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
- { "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
- { "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
- { "Tesla K20m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgemvComplexDouble = {
- "Xgemv", Precision::kComplexDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
- { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
- { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
- }
- },
- { // Intel accelerators
- kDeviceTypeAccelerator, "Intel", {
- { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
- { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-} // namespace clblast
diff --git a/include/internal/database/xger.h b/include/internal/database/xger.h
deleted file mode 100644
index f2e0a36f..00000000
--- a/include/internal/database/xger.h
+++ /dev/null
@@ -1,220 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Database generator <database.py>
-//
-// This file populates the database with best-found tuning parameters for the 'Xger' kernels.
-//
-// =================================================================================================
-
-namespace clblast {
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgerHalf = {
- "Xger", Precision::kHalf, {
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
- { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgerSingle = {
- "Xger", Precision::kSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
- { "Hawaii", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
- { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
- { "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
- { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } },
- { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
- { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } },
- { "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
- { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",4} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
- { "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
- { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
- { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
- { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgerComplexSingle = {
- "Xger", Precision::kComplexSingle, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
- { "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
- { "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
- { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
- { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
- { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
- { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
- }
- },
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
- { "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
- { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
- { "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
- { "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
- { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
- { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgerDouble = {
- "Xger", Precision::kDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
- { "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
- { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
- { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
- { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
- { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
- { "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",1} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } },
- { "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
- { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
- { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
- { "default", { {"WGS1",16}, {"WGS2",4}, {"WPT",2} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const Database::DatabaseEntry Database::XgerComplexDouble = {
- "Xger", Precision::kComplexDouble, {
- { // AMD GPUs
- kDeviceTypeGPU, "AMD", {
- { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
- { "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
- { "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
- { "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
- { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
- }
- },
- { // ARM GPUs
- kDeviceTypeGPU, "ARM", {
- { "Mali-T628", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } },
- { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } },
- }
- },
- { // Intel CPUs
- kDeviceTypeCPU, "Intel", {
- { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
- { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
- { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "GRID K520", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
- { "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
- { "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
- { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
- { "default", { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } },
- }
- },
- }
-};
-
-// =================================================================================================
-} // namespace clblast
diff --git a/include/internal/public_api.h b/include/internal/public_api.h
deleted file mode 100644
index d0732297..00000000
--- a/include/internal/public_api.h
+++ /dev/null
@@ -1,34 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file provides macro's to define the public API. This is needed when building a Windows DLL.
-// Note: this is only used for the C++ interface, the C interface has its own definition included in
-// the header file itself.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_PUBLIC_API_H_
-#define CLBLAST_PUBLIC_API_H_
-
-namespace clblast {
-// =================================================================================================
-
-// Exports library functions under Windows when building a DLL. See also:
-// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
-#ifdef _WIN32
- #define PUBLIC_API __declspec(dllexport)
-#else
- #define PUBLIC_API
-#endif
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_PUBLIC_API_H_
-#endif
diff --git a/include/internal/routine.h b/include/internal/routine.h
deleted file mode 100644
index a6a59d77..00000000
--- a/include/internal/routine.h
+++ /dev/null
@@ -1,68 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements all the basic functionality for the BLAS routines. This class serves as a
-// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as
-// compiling the OpenCL kernel, connecting to the database, etc.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINE_H_
-#define CLBLAST_ROUTINE_H_
-
-#include <string>
-#include <vector>
-
-#include "internal/cache.h"
-#include "internal/utilities.h"
-#include "internal/database.h"
-#include "internal/buffer_test.h"
-#include "internal/routines/common.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-class Routine {
- public:
-
- // Base class constructor
- explicit Routine(Queue &queue, EventPointer event, const std::string &name,
- const std::vector<std::string> &routines, const Precision precision);
-
- // Set-up phase of the kernel
- StatusCode SetUp();
-
- protected:
-
- // Non-static variable for the precision
- const Precision precision_;
-
- // The routine's name and its kernel-source in string form
- const std::string routine_name_;
- std::string source_string_;
-
- // The OpenCL objects, accessible only from derived classes
- Queue queue_;
- EventPointer event_;
- const Context context_;
- const Device device_;
-
- // OpenCL device properties
- const std::string device_name_;
-
- // Connection to the database for all the device-specific parameters
- const Database db_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINE_H_
-#endif
diff --git a/include/internal/routines/common.h b/include/internal/routines/common.h
deleted file mode 100644
index 308785bd..00000000
--- a/include/internal/routines/common.h
+++ /dev/null
@@ -1,173 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file contains all the interfaces to common kernels, such as copying, padding, and
-// transposing a matrix. These functions are templated and thus header-only. This file also contains
-// other common functions to routines, such as a function to launch a kernel.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_COMMON_H_
-#define CLBLAST_ROUTINES_COMMON_H_
-
-#include <string>
-#include <vector>
-
-#include "clblast.h"
-#include "internal/clpp11.h"
-#include "internal/database.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Enqueues a kernel, waits for completion, and checks for errors
-StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
- std::vector<size_t> global, const std::vector<size_t> &local,
- EventPointer event, std::vector<Event>& waitForEvents);
-
-// As above, but without an event waiting list
-StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
- std::vector<size_t> global, const std::vector<size_t> &local,
- EventPointer event);
-
-// =================================================================================================
-
-// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
-// to write to symmetric and triangular matrices through optional arguments.
-template <typename T>
-StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Context &context,
- const Database &db,
- EventPointer event, std::vector<Event>& waitForEvents,
- const size_t src_one, const size_t src_two,
- const size_t src_ld, const size_t src_offset,
- const Buffer<T> &src,
- const size_t dest_one, const size_t dest_two,
- const size_t dest_ld, const size_t dest_offset,
- const Buffer<T> &dest,
- const T alpha,
- const Program &program, const bool do_pad,
- const bool do_transpose, const bool do_conjugate,
- const bool upper = false, const bool lower = false,
- const bool diagonal_imag_zero = false) {
-
- // Determines whether or not the fast-version could potentially be used
- auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
- (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
- (upper == false) && (lower == false) && (diagonal_imag_zero == false);
-
- // Determines the right kernel
- auto kernel_name = std::string{};
- if (do_transpose) {
- if (use_fast_kernel &&
- IsMultiple(src_ld, db["TRA_WPT"]) &&
- IsMultiple(src_one, db["TRA_WPT"]*db["TRA_WPT"]) &&
- IsMultiple(src_two, db["TRA_WPT"]*db["TRA_WPT"])) {
- kernel_name = "TransposeMatrixFast";
- }
- else {
- use_fast_kernel = false;
- kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
- }
- }
- else {
- if (use_fast_kernel &&
- IsMultiple(src_ld, db["COPY_VW"]) &&
- IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) &&
- IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) {
- kernel_name = "CopyMatrixFast";
- }
- else {
- use_fast_kernel = false;
- kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
- }
- }
-
- // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
- auto alpha_buffer = Buffer<T>(context, 1);
- alpha_buffer.Write(queue, 1, &alpha);
-
- // Retrieves the kernel from the compiled binary
- try {
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the kernel arguments
- if (use_fast_kernel) {
- kernel.SetArgument(0, static_cast<int>(src_ld));
- kernel.SetArgument(1, src());
- kernel.SetArgument(2, dest());
- kernel.SetArgument(3, alpha_buffer());
- }
- else {
- kernel.SetArgument(0, static_cast<int>(src_one));
- kernel.SetArgument(1, static_cast<int>(src_two));
- kernel.SetArgument(2, static_cast<int>(src_ld));
- kernel.SetArgument(3, static_cast<int>(src_offset));
- kernel.SetArgument(4, src());
- kernel.SetArgument(5, static_cast<int>(dest_one));
- kernel.SetArgument(6, static_cast<int>(dest_two));
- kernel.SetArgument(7, static_cast<int>(dest_ld));
- kernel.SetArgument(8, static_cast<int>(dest_offset));
- kernel.SetArgument(9, dest());
- kernel.SetArgument(10, alpha_buffer());
- if (do_pad) {
- kernel.SetArgument(11, static_cast<int>(do_conjugate));
- }
- else {
- kernel.SetArgument(11, static_cast<int>(upper));
- kernel.SetArgument(12, static_cast<int>(lower));
- kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
- }
- }
-
- // Launches the kernel and returns the error code. Uses global and local thread sizes based on
- // parameters in the database.
- if (do_transpose) {
- if (use_fast_kernel) {
- const auto global = std::vector<size_t>{
- dest_one / db["TRA_WPT"],
- dest_two / db["TRA_WPT"]
- };
- const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
- return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
- }
- else {
- const auto global = std::vector<size_t>{
- Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
- Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
- };
- const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
- return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
- }
- }
- else {
- if (use_fast_kernel) {
- const auto global = std::vector<size_t>{
- dest_one / db["COPY_VW"],
- dest_two / db["COPY_WPT"]
- };
- const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
- return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
- }
- else {
- const auto global = std::vector<size_t>{
- Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
- Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
- };
- const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
- return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
- }
- }
- } catch (...) { return StatusCode::kInvalidKernel; }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_COMMON_H_
-#endif
diff --git a/include/internal/routines/level1/xamax.h b/include/internal/routines/level1/xamax.h
deleted file mode 100644
index 42f8f67c..00000000
--- a/include/internal/routines/level1/xamax.h
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xamax routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XAMAX_H_
-#define CLBLAST_ROUTINES_XAMAX_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xamax: public Routine {
- public:
-
- // Constructor
- Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
-
- // Templated-precision implementation of the routine
- StatusCode DoAmax(const size_t n,
- const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XAMAX_H_
-#endif
diff --git a/include/internal/routines/level1/xasum.h b/include/internal/routines/level1/xasum.h
deleted file mode 100644
index 9d93a6f4..00000000
--- a/include/internal/routines/level1/xasum.h
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xasum routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XASUM_H_
-#define CLBLAST_ROUTINES_XASUM_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xasum: public Routine {
- public:
-
- // Constructor
- Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
-
- // Templated-precision implementation of the routine
- StatusCode DoAsum(const size_t n,
- const Buffer<T> &asum_buffer, const size_t asum_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XASUM_H_
-#endif
diff --git a/include/internal/routines/level1/xaxpy.h b/include/internal/routines/level1/xaxpy.h
deleted file mode 100644
index 4c8d2c1f..00000000
--- a/include/internal/routines/level1/xaxpy.h
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xaxpy routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XAXPY_H_
-#define CLBLAST_ROUTINES_XAXPY_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xaxpy: public Routine {
- public:
-
- // Constructor
- Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
-
- // Templated-precision implementation of the routine
- StatusCode DoAxpy(const size_t n, const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XAXPY_H_
-#endif
diff --git a/include/internal/routines/level1/xcopy.h b/include/internal/routines/level1/xcopy.h
deleted file mode 100644
index c7d03dd0..00000000
--- a/include/internal/routines/level1/xcopy.h
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xcopy routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XCOPY_H_
-#define CLBLAST_ROUTINES_XCOPY_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xcopy: public Routine {
- public:
-
- // Constructor
- Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
-
- // Templated-precision implementation of the routine
- StatusCode DoCopy(const size_t n,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XCOPY_H_
-#endif
diff --git a/include/internal/routines/level1/xdot.h b/include/internal/routines/level1/xdot.h
deleted file mode 100644
index e1968740..00000000
--- a/include/internal/routines/level1/xdot.h
+++ /dev/null
@@ -1,42 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xdot routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XDOT_H_
-#define CLBLAST_ROUTINES_XDOT_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xdot: public Routine {
- public:
-
- // Constructor
- Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
-
- // Templated-precision implementation of the routine
- StatusCode DoDot(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const bool do_conjugate = false);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XDOT_H_
-#endif
diff --git a/include/internal/routines/level1/xdotc.h b/include/internal/routines/level1/xdotc.h
deleted file mode 100644
index 0dc2cfe9..00000000
--- a/include/internal/routines/level1/xdotc.h
+++ /dev/null
@@ -1,44 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xdotc routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XDOTC_H_
-#define CLBLAST_ROUTINES_XDOTC_H_
-
-#include "internal/routines/level1/xdot.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xdotc: public Xdot<T> {
- public:
-
- // Uses the regular Xdot routine
- using Xdot<T>::DoDot;
-
- // Constructor
- Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");
-
- // Templated-precision implementation of the routine
- StatusCode DoDotc(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XDOTC_H_
-#endif
diff --git a/include/internal/routines/level1/xdotu.h b/include/internal/routines/level1/xdotu.h
deleted file mode 100644
index 98988744..00000000
--- a/include/internal/routines/level1/xdotu.h
+++ /dev/null
@@ -1,44 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xdotu routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XDOTU_H_
-#define CLBLAST_ROUTINES_XDOTU_H_
-
-#include "internal/routines/level1/xdot.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xdotu: public Xdot<T> {
- public:
-
- // Uses the regular Xdot routine
- using Xdot<T>::DoDot;
-
- // Constructor
- Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");
-
- // Templated-precision implementation of the routine
- StatusCode DoDotu(const size_t n,
- const Buffer<T> &dot_buffer, const size_t dot_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XDOTU_H_
-#endif
diff --git a/include/internal/routines/level1/xmax.h b/include/internal/routines/level1/xmax.h
deleted file mode 100644
index a872cede..00000000
--- a/include/internal/routines/level1/xmax.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xmax routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XMAX_H_
-#define CLBLAST_ROUTINES_XMAX_H_
-
-#include "internal/routine.h"
-#include "internal/routines/level1/xamax.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xmax: public Xamax<T> {
- public:
-
- // Members and methods from the base class
- using Xamax<T>::DoAmax;
-
- // Constructor
- Xmax(Queue &queue, EventPointer event, const std::string &name = "MAX"):
- Xamax<T>(queue, event, name) {
- }
-
- // Forwards to the regular absolute version. The implementation difference is realised in the
- // kernel through a pre-processor macro based on the name of the routine.
- StatusCode DoMax(const size_t n,
- const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
- return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
- }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XMAX_H_
-#endif
diff --git a/include/internal/routines/level1/xmin.h b/include/internal/routines/level1/xmin.h
deleted file mode 100644
index 700c81cc..00000000
--- a/include/internal/routines/level1/xmin.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xmin routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XMIN_H_
-#define CLBLAST_ROUTINES_XMIN_H_
-
-#include "internal/routine.h"
-#include "internal/routines/level1/xamax.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xmin: public Xamax<T> {
- public:
-
- // Members and methods from the base class
- using Xamax<T>::DoAmax;
-
- // Constructor
- Xmin(Queue &queue, EventPointer event, const std::string &name = "MIN"):
- Xamax<T>(queue, event, name) {
- }
-
- // Forwards to the regular max-absolute version. The implementation difference is realised in the
- // kernel through a pre-processor macro based on the name of the routine.
- StatusCode DoMin(const size_t n,
- const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
- return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
- }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XMIN_H_
-#endif
diff --git a/include/internal/routines/level1/xnrm2.h b/include/internal/routines/level1/xnrm2.h
deleted file mode 100644
index ca9268c0..00000000
--- a/include/internal/routines/level1/xnrm2.h
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xnrm2 routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XNRM2_H_
-#define CLBLAST_ROUTINES_XNRM2_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xnrm2: public Routine {
- public:
-
- // Constructor
- Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
-
- // Templated-precision implementation of the routine
- StatusCode DoNrm2(const size_t n,
- const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XNRM2_H_
-#endif
diff --git a/include/internal/routines/level1/xscal.h b/include/internal/routines/level1/xscal.h
deleted file mode 100644
index b9430f3b..00000000
--- a/include/internal/routines/level1/xscal.h
+++ /dev/null
@@ -1,39 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xscal routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSCAL_H_
-#define CLBLAST_ROUTINES_XSCAL_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xscal: public Routine {
- public:
-
- // Constructor
- Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
-
- // Templated-precision implementation of the routine
- StatusCode DoScal(const size_t n, const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSCAL_H_
-#endif
diff --git a/include/internal/routines/level1/xsum.h b/include/internal/routines/level1/xsum.h
deleted file mode 100644
index 2f633b52..00000000
--- a/include/internal/routines/level1/xsum.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsum routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSUM_H_
-#define CLBLAST_ROUTINES_XSUM_H_
-
-#include "internal/routine.h"
-#include "internal/routines/level1/xasum.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xsum: public Xasum<T> {
- public:
-
- // Members and methods from the base class
- using Xasum<T>::DoAsum;
-
- // Constructor
- Xsum(Queue &queue, EventPointer event, const std::string &name = "SUM"):
- Xasum<T>(queue, event, name) {
- }
-
- // Forwards to the regular absolute version. The implementation difference is realised in the
- // kernel through a pre-processor macro based on the name of the routine.
- StatusCode DoSum(const size_t n,
- const Buffer<T> &sum_buffer, const size_t sum_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
- return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
- }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSUM_H_
-#endif
diff --git a/include/internal/routines/level1/xswap.h b/include/internal/routines/level1/xswap.h
deleted file mode 100644
index bd063afc..00000000
--- a/include/internal/routines/level1/xswap.h
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xswap routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSWAP_H_
-#define CLBLAST_ROUTINES_XSWAP_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xswap: public Routine {
- public:
-
- // Constructor
- Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
-
- // Templated-precision implementation of the routine
- StatusCode DoSwap(const size_t n,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSWAP_H_
-#endif
diff --git a/include/internal/routines/level2/xgbmv.h b/include/internal/routines/level2/xgbmv.h
deleted file mode 100644
index bc94c77d..00000000
--- a/include/internal/routines/level2/xgbmv.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgbmv routine. It is based on the generalized mat-vec multiplication
-// routine (Xgemv). The Xgbmv class inherits from the templated class Xgemv, allowing it to call the
-// "MatVec" function directly.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XGBMV_H_
-#define CLBLAST_ROUTINES_XGBMV_H_
-
-#include "internal/routines/level2/xgemv.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xgbmv: public Xgemv<T> {
- public:
-
- // Uses the generic matrix-vector routine
- using Xgemv<T>::MatVec;
-
- // Constructor
- Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");
-
- // Templated-precision implementation of the routine
- StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const size_t kl, const size_t ku,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XGBMV_H_
-#endif
diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h
deleted file mode 100644
index e9804c62..00000000
--- a/include/internal/routines/level2/xgemv.h
+++ /dev/null
@@ -1,56 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgemv routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XGEMV_H_
-#define CLBLAST_ROUTINES_XGEMV_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xgemv: public Routine {
- public:
-
- // Constructor
- Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
-
- // Templated-precision implementation of the routine
- StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-
- // Generic version used also for other matrix-vector multiplications
- StatusCode MatVec(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- bool fast_kernel, bool fast_kernel_rot,
- const size_t parameter, const bool packed,
- const size_t kl, const size_t ku);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XGEMV_H_
-#endif
diff --git a/include/internal/routines/level2/xger.h b/include/internal/routines/level2/xger.h
deleted file mode 100644
index 184f8477..00000000
--- a/include/internal/routines/level2/xger.h
+++ /dev/null
@@ -1,43 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xger routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XGER_H_
-#define CLBLAST_ROUTINES_XGER_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xger: public Routine {
- public:
-
- // Constructor
- Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
-
- // Templated-precision implementation of the routine
- StatusCode DoGer(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XGER_H_
-#endif
diff --git a/include/internal/routines/level2/xgerc.h b/include/internal/routines/level2/xgerc.h
deleted file mode 100644
index 6d06ef94..00000000
--- a/include/internal/routines/level2/xgerc.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgerc routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XGERC_H_
-#define CLBLAST_ROUTINES_XGERC_H_
-
-#include "internal/routines/level2/xger.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xgerc: public Xger<T> {
- public:
-
- // Uses the regular Xger routine
- using Xger<T>::DoGer;
-
- // Constructor
- Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");
-
- // Templated-precision implementation of the routine
- StatusCode DoGerc(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XGERC_H_
-#endif
diff --git a/include/internal/routines/level2/xgeru.h b/include/internal/routines/level2/xgeru.h
deleted file mode 100644
index 45ce1cba..00000000
--- a/include/internal/routines/level2/xgeru.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgeru routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XGERU_H_
-#define CLBLAST_ROUTINES_XGERU_H_
-
-#include "internal/routines/level2/xger.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xgeru: public Xger<T> {
- public:
-
- // Uses the regular Xger routine
- using Xger<T>::DoGer;
-
- // Constructor
- Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");
-
- // Templated-precision implementation of the routine
- StatusCode DoGeru(const Layout layout,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XGERU_H_
-#endif
diff --git a/include/internal/routines/level2/xhbmv.h b/include/internal/routines/level2/xhbmv.h
deleted file mode 100644
index f0a6212c..00000000
--- a/include/internal/routines/level2/xhbmv.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xhbmv routine. It is based on the generalized mat-vec multiplication
-// routine (Xgemv). The Xhbmv class inherits from the templated class Xgemv, allowing it to call the
-// "MatVec" function directly.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XHBMV_H_
-#define CLBLAST_ROUTINES_XHBMV_H_
-
-#include "internal/routines/level2/xgemv.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xhbmv: public Xgemv<T> {
- public:
-
- // Uses the generic matrix-vector routine
- using Xgemv<T>::MatVec;
-
- // Constructor
- Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");
-
- // Templated-precision implementation of the routine
- StatusCode DoHbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XHBMV_H_
-#endif
diff --git a/include/internal/routines/level2/xhemv.h b/include/internal/routines/level2/xhemv.h
deleted file mode 100644
index 3daf2457..00000000
--- a/include/internal/routines/level2/xhemv.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xhemv routine. It is based on the generalized mat-vec multiplication
-// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call the
-// "MatVec" function directly.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XHEMV_H_
-#define CLBLAST_ROUTINES_XHEMV_H_
-
-#include "internal/routines/level2/xgemv.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xhemv: public Xgemv<T> {
- public:
-
- // Uses the generic matrix-vector routine
- using Xgemv<T>::MatVec;
-
- // Constructor
- Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");
-
- // Templated-precision implementation of the routine
- StatusCode DoHemv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XHEMV_H_
-#endif
diff --git a/include/internal/routines/level2/xher.h b/include/internal/routines/level2/xher.h
deleted file mode 100644
index fca8bb97..00000000
--- a/include/internal/routines/level2/xher.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xher routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XHER_H_
-#define CLBLAST_ROUTINES_XHER_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T, typename U>
-class Xher: public Routine {
- public:
-
- // Constructor
- Xher(Queue &queue, EventPointer event, const std::string &name = "HER");
-
- // Translates alpha of type 'U' into type 'T'
- T GetAlpha(const U alpha);
-
- // Templated-precision implementation of the routine
- StatusCode DoHer(const Layout layout, const Triangle triangle,
- const size_t n,
- const U alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const bool packed = false);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XHER_H_
-#endif
diff --git a/include/internal/routines/level2/xher2.h b/include/internal/routines/level2/xher2.h
deleted file mode 100644
index 9a7610f1..00000000
--- a/include/internal/routines/level2/xher2.h
+++ /dev/null
@@ -1,44 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xher2 routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XHER2_H_
-#define CLBLAST_ROUTINES_XHER2_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xher2: public Routine {
- public:
-
- // Constructor
- Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
-
- // Templated-precision implementation of the routine
- StatusCode DoHer2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const bool packed = false);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XHER2_H_
-#endif
diff --git a/include/internal/routines/level2/xhpmv.h b/include/internal/routines/level2/xhpmv.h
deleted file mode 100644
index a1d5595a..00000000
--- a/include/internal/routines/level2/xhpmv.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xhpmv routine. It is based on the generalized mat-vec multiplication
-// routine (Xgemv). The Xhpmv class inherits from the templated class Xgemv, allowing it to call the
-// "MatVec" function directly.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XHPMV_H_
-#define CLBLAST_ROUTINES_XHPMV_H_
-
-#include "internal/routines/level2/xgemv.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xhpmv: public Xgemv<T> {
- public:
-
- // Uses the generic matrix-vector routine
- using Xgemv<T>::MatVec;
-
- // Constructor
- Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");
-
- // Templated-precision implementation of the routine
- StatusCode DoHpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XHPMV_H_
-#endif
diff --git a/include/internal/routines/level2/xhpr.h b/include/internal/routines/level2/xhpr.h
deleted file mode 100644
index 6554d74c..00000000
--- a/include/internal/routines/level2/xhpr.h
+++ /dev/null
@@ -1,45 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xhpr routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XHPR_H_
-#define CLBLAST_ROUTINES_XHPR_H_
-
-#include "internal/routines/level2/xher.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T, typename U>
-class Xhpr: public Xher<T,U> {
- public:
-
- // Uses the regular Xher routine
- using Xher<T,U>::DoHer;
-
- // Constructor
- Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");
-
- // Templated-precision implementation of the routine
- StatusCode DoHpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const U alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XHPR_H_
-#endif
diff --git a/include/internal/routines/level2/xhpr2.h b/include/internal/routines/level2/xhpr2.h
deleted file mode 100644
index d95e7b61..00000000
--- a/include/internal/routines/level2/xhpr2.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xhpr2 routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XHPR2_H_
-#define CLBLAST_ROUTINES_XHPR2_H_
-
-#include "internal/routines/level2/xher2.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xhpr2: public Xher2<T> {
- public:
-
- // Uses the regular Xher2 routine
- using Xher2<T>::DoHer2;
-
- // Constructor
- Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");
-
- // Templated-precision implementation of the routine
- StatusCode DoHpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XHPR2_H_
-#endif
diff --git a/include/internal/routines/level2/xsbmv.h b/include/internal/routines/level2/xsbmv.h
deleted file mode 100644
index 4328e377..00000000
--- a/include/internal/routines/level2/xsbmv.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsbmv routine. It is based on the generalized mat-vec multiplication
-// routine (Xgemv). The Xsbmv class inherits from the templated class Xgemv, allowing it to call the
-// "MatVec" function directly.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSBMV_H_
-#define CLBLAST_ROUTINES_XSBMV_H_
-
-#include "internal/routines/level2/xgemv.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xsbmv: public Xgemv<T> {
- public:
-
- // Uses the generic matrix-vector routine
- using Xgemv<T>::MatVec;
-
- // Constructor
- Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");
-
- // Templated-precision implementation of the routine
- StatusCode DoSbmv(const Layout layout, const Triangle triangle,
- const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSBMV_H_
-#endif
diff --git a/include/internal/routines/level2/xspmv.h b/include/internal/routines/level2/xspmv.h
deleted file mode 100644
index ca3e28b6..00000000
--- a/include/internal/routines/level2/xspmv.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xspmv routine. It is based on the generalized mat-vec multiplication
-// routine (Xgemv). The Xspmv class inherits from the templated class Xgemv, allowing it to call the
-// "MatVec" function directly.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSPMV_H_
-#define CLBLAST_ROUTINES_XSPMV_H_
-
-#include "internal/routines/level2/xgemv.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xspmv: public Xgemv<T> {
- public:
-
- // Uses the generic matrix-vector routine
- using Xgemv<T>::MatVec;
-
- // Constructor
- Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");
-
- // Templated-precision implementation of the routine
- StatusCode DoSpmv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSPMV_H_
-#endif
diff --git a/include/internal/routines/level2/xspr.h b/include/internal/routines/level2/xspr.h
deleted file mode 100644
index 7e91abc5..00000000
--- a/include/internal/routines/level2/xspr.h
+++ /dev/null
@@ -1,45 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xspr routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSPR_H_
-#define CLBLAST_ROUTINES_XSPR_H_
-
-#include "internal/routines/level2/xher.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xspr: public Xher<T,T> {
- public:
-
- // Uses the regular Xher routine
- using Xher<T,T>::DoHer;
-
- // Constructor
- Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");
-
- // Templated-precision implementation of the routine
- StatusCode DoSpr(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSPR_H_
-#endif
diff --git a/include/internal/routines/level2/xspr2.h b/include/internal/routines/level2/xspr2.h
deleted file mode 100644
index a34be8e8..00000000
--- a/include/internal/routines/level2/xspr2.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xspr2 routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSPR2_H_
-#define CLBLAST_ROUTINES_XSPR2_H_
-
-#include "internal/routines/level2/xher2.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xspr2: public Xher2<T> {
- public:
-
- // Uses the regular Xher2 routine
- using Xher2<T>::DoHer2;
-
- // Constructor
- Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");
-
- // Templated-precision implementation of the routine
- StatusCode DoSpr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &ap_buffer, const size_t ap_offset);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSPR2_H_
-#endif
diff --git a/include/internal/routines/level2/xsymv.h b/include/internal/routines/level2/xsymv.h
deleted file mode 100644
index 98a0ce88..00000000
--- a/include/internal/routines/level2/xsymv.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsymv routine. It is based on the generalized mat-vec multiplication
-// routine (Xgemv). The Xsymv class inherits from the templated class Xgemv, allowing it to call the
-// "MatVec" function directly.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSYMV_H_
-#define CLBLAST_ROUTINES_XSYMV_H_
-
-#include "internal/routines/level2/xgemv.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xsymv: public Xgemv<T> {
- public:
-
- // Uses the generic matrix-vector routine
- using Xgemv<T>::MatVec;
-
- // Constructor
- Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV");
-
- // Templated-precision implementation of the routine
- StatusCode DoSymv(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const T beta,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSYMV_H_
-#endif
diff --git a/include/internal/routines/level2/xsyr.h b/include/internal/routines/level2/xsyr.h
deleted file mode 100644
index f88498ae..00000000
--- a/include/internal/routines/level2/xsyr.h
+++ /dev/null
@@ -1,45 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsyr routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSYR_H_
-#define CLBLAST_ROUTINES_XSYR_H_
-
-#include "internal/routines/level2/xher.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xsyr: public Xher<T,T> {
- public:
-
- // Uses the regular Xher routine
- using Xher<T,T>::DoHer;
-
- // Constructor
- Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR");
-
- // Templated-precision implementation of the routine
- StatusCode DoSyr(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSYR_H_
-#endif
diff --git a/include/internal/routines/level2/xsyr2.h b/include/internal/routines/level2/xsyr2.h
deleted file mode 100644
index d2d3143a..00000000
--- a/include/internal/routines/level2/xsyr2.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsyr2 routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSYR2_H_
-#define CLBLAST_ROUTINES_XSYR2_H_
-
-#include "internal/routines/level2/xher2.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xsyr2: public Xher2<T> {
- public:
-
- // Uses the regular Xher2 routine
- using Xher2<T>::DoHer2;
-
- // Constructor
- Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2");
-
- // Templated-precision implementation of the routine
- StatusCode DoSyr2(const Layout layout, const Triangle triangle,
- const size_t n,
- const T alpha,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSYR2_H_
-#endif
diff --git a/include/internal/routines/level2/xtbmv.h b/include/internal/routines/level2/xtbmv.h
deleted file mode 100644
index 493a9853..00000000
--- a/include/internal/routines/level2/xtbmv.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xtbmv routine. It is based on the generalized mat-vec multiplication
-// routine (Xgemv). The Xtbmv class inherits from the templated class Xgemv, allowing it to call the
-// "MatVec" function directly.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XTBMV_H_
-#define CLBLAST_ROUTINES_XTBMV_H_
-
-#include "internal/routines/level2/xgemv.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xtbmv: public Xgemv<T> {
- public:
-
- // Uses the generic matrix-vector routine
- using Xgemv<T>::queue_;
- using Xgemv<T>::context_;
- using Xgemv<T>::MatVec;
-
- // Constructor
- Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV");
-
- // Templated-precision implementation of the routine
- StatusCode DoTbmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n, const size_t k,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XTBMV_H_
-#endif
diff --git a/include/internal/routines/level2/xtpmv.h b/include/internal/routines/level2/xtpmv.h
deleted file mode 100644
index ce5cae6f..00000000
--- a/include/internal/routines/level2/xtpmv.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xtpmv routine. It is based on the generalized mat-vec multiplication
-// routine (Xgemv). The Xtpmv class inherits from the templated class Xgemv, allowing it to call the
-// "MatVec" function directly.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XTPMV_H_
-#define CLBLAST_ROUTINES_XTPMV_H_
-
-#include "internal/routines/level2/xgemv.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xtpmv: public Xgemv<T> {
- public:
-
- // Uses the generic matrix-vector routine
- using Xgemv<T>::queue_;
- using Xgemv<T>::context_;
- using Xgemv<T>::MatVec;
-
- // Constructor
- Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV");
-
- // Templated-precision implementation of the routine
- StatusCode DoTpmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const Buffer<T> &ap_buffer, const size_t ap_offset,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XTPMV_H_
-#endif
diff --git a/include/internal/routines/level2/xtrmv.h b/include/internal/routines/level2/xtrmv.h
deleted file mode 100644
index 4407bad7..00000000
--- a/include/internal/routines/level2/xtrmv.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xtrmv routine. It is based on the generalized mat-vec multiplication
-// routine (Xgemv). The Xtrmv class inherits from the templated class Xgemv, allowing it to call the
-// "MatVec" function directly.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XTRMV_H_
-#define CLBLAST_ROUTINES_XTRMV_H_
-
-#include "internal/routines/level2/xgemv.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xtrmv: public Xgemv<T> {
- public:
-
- // Uses the generic matrix-vector routine
- using Xgemv<T>::queue_;
- using Xgemv<T>::context_;
- using Xgemv<T>::MatVec;
-
- // Constructor
- Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV");
-
- // Templated-precision implementation of the routine
- StatusCode DoTrmv(const Layout layout, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t n,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XTRMV_H_
-#endif
diff --git a/include/internal/routines/level3/xgemm.h b/include/internal/routines/level3/xgemm.h
deleted file mode 100644
index c0541eef..00000000
--- a/include/internal/routines/level3/xgemm.h
+++ /dev/null
@@ -1,48 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgemm routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XGEMM_H_
-#define CLBLAST_ROUTINES_XGEMM_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xgemm: public Routine {
- public:
-
- // Constructor
- Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM");
-
- // Templated-precision implementation of the routine
- StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
- const size_t m, const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-
- protected:
- // Static variable to get the precision
- const static Precision precision_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XGEMM_H_
-#endif
diff --git a/include/internal/routines/level3/xhemm.h b/include/internal/routines/level3/xhemm.h
deleted file mode 100644
index e0f35669..00000000
--- a/include/internal/routines/level3/xhemm.h
+++ /dev/null
@@ -1,54 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xhemm routine. It is based on the generalized matrix multiplication
-// routine (Xgemm). The implementation is very similar to the Xsymm routine.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XHEMM_H_
-#define CLBLAST_ROUTINES_XHEMM_H_
-
-#include "internal/routines/level3/xgemm.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xhemm: public Xgemm<T> {
- public:
-
- // Uses methods and variables the regular Xgemm routine
- using Xgemm<T>::precision_;
- using Xgemm<T>::routine_name_;
- using Xgemm<T>::queue_;
- using Xgemm<T>::context_;
- using Xgemm<T>::device_;
- using Xgemm<T>::db_;
- using Xgemm<T>::DoGemm;
-
- // Constructor
- Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM");
-
- // Templated-precision implementation of the routine
- StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XHEMM_H_
-#endif
diff --git a/include/internal/routines/level3/xher2k.h b/include/internal/routines/level3/xher2k.h
deleted file mode 100644
index b7764e18..00000000
--- a/include/internal/routines/level3/xher2k.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xher2k routine. The precision is implemented using the template argument
-// 'T', which is also the type of alpha, whereas beta is of type 'U'. The implementation is very
-// similar to the Xsyr2k routine.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XHER2K_H_
-#define CLBLAST_ROUTINES_XHER2K_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Xher2k routine class, derived from Routine (see the comment at the top of this file)
-template <typename T, typename U>
-class Xher2k: public Routine {
- public:
-
- // Constructor: takes the queue and an event pointer; 'name' defaults to "HER2K"
- Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K");
-
- // Templated-precision implementation of the routine; reports its outcome as a StatusCode
- StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const T alpha, // note: alpha has the buffer element type 'T'
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const U beta, // note: only beta uses the second template type 'U'
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XHER2K_H_
-#endif
diff --git a/include/internal/routines/level3/xherk.h b/include/internal/routines/level3/xherk.h
deleted file mode 100644
index abcf4c1a..00000000
--- a/include/internal/routines/level3/xherk.h
+++ /dev/null
@@ -1,45 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xherk routine. The precision is implemented using the template argument
-// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the
-// Xsyrk routine.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XHERK_H_
-#define CLBLAST_ROUTINES_XHERK_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Xherk routine class, derived from Routine (see the comment at the top of this file)
-template <typename T, typename U>
-class Xherk: public Routine {
- public:
-
- // Constructor: takes the queue and an event pointer; 'name' defaults to "HERK"
- Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK");
-
- // Templated-precision implementation of the routine; reports its outcome as a StatusCode
- StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const U alpha, // scalar of the second template type 'U'
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const U beta, // scalar of the second template type 'U'
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XHERK_H_
-#endif
diff --git a/include/internal/routines/level3/xsymm.h b/include/internal/routines/level3/xsymm.h
deleted file mode 100644
index 889abfb7..00000000
--- a/include/internal/routines/level3/xsymm.h
+++ /dev/null
@@ -1,56 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsymm routine. It is based on the generalized matrix multiplication
-// routine (Xgemm). The Xsymm class inherits from the templated class Xgemm, allowing it to call the
-// "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by
-// transforming it into a general matrix, and then calls the regular GEMM code.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSYMM_H_
-#define CLBLAST_ROUTINES_XSYMM_H_
-
-#include "internal/routines/level3/xgemm.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Xsymm routine class, derived from Xgemm<T> (see the comment at the top of this file)
-template <typename T>
-class Xsymm: public Xgemm<T> {
- public:
-
- // Brings these members of the dependent base class Xgemm<T> into the scope of this class
- using Xgemm<T>::precision_;
- using Xgemm<T>::routine_name_;
- using Xgemm<T>::queue_;
- using Xgemm<T>::context_;
- using Xgemm<T>::device_;
- using Xgemm<T>::db_;
- using Xgemm<T>::DoGemm;
-
- // Constructor: takes the queue and an event pointer; 'name' defaults to "SYMM"
- Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM");
-
- // Templated-precision implementation of the routine; reports its outcome as a StatusCode
- StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
- const size_t m, const size_t n,
- const T alpha, // same type 'T' as the buffer elements
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta, // same type 'T' as the buffer elements
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSYMM_H_
-#endif
diff --git a/include/internal/routines/level3/xsyr2k.h b/include/internal/routines/level3/xsyr2k.h
deleted file mode 100644
index f75c91e5..00000000
--- a/include/internal/routines/level3/xsyr2k.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsyr2k routine. The precision is implemented using a template argument.
-// The implementation is very similar to Xsyrk (see header for details), except for the fact that
-// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSYR2K_H_
-#define CLBLAST_ROUTINES_XSYR2K_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Xsyr2k routine class, derived from Routine (see the comment at the top of this file)
-template <typename T>
-class Xsyr2k: public Routine {
- public:
-
- // Constructor: takes the queue and an event pointer; 'name' defaults to "SYR2K"
- Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");
-
- // Templated-precision implementation of the routine; reports its outcome as a StatusCode
- StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
- const size_t n, const size_t k,
- const T alpha, // same type 'T' as the buffer elements
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta, // same type 'T' as the buffer elements
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSYR2K_H_
-#endif
diff --git a/include/internal/routines/level3/xsyrk.h b/include/internal/routines/level3/xsyrk.h
deleted file mode 100644
index 0710fa74..00000000
--- a/include/internal/routines/level3/xsyrk.h
+++ /dev/null
@@ -1,47 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsyrk routine. The precision is implemented using a template argument.
-// The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
-// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
-// 2) The main Xgemm kernel masks workgroups not contributing to useful data. This is only for
-// performance reasons, as the actual masking is done later (see the first point).
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSYRK_H_
-#define CLBLAST_ROUTINES_XSYRK_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Xsyrk routine class, derived from Routine (see the comment at the top of this file)
-template <typename T>
-class Xsyrk: public Routine {
- public:
-
- // Constructor: takes the queue and an event pointer; 'name' defaults to "SYRK"
- Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK");
-
- // Templated-precision implementation of the routine; reports its outcome as a StatusCode
- StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
- const size_t n, const size_t k,
- const T alpha, // same type 'T' as the buffer elements
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const T beta, // same type 'T' as the buffer elements
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSYRK_H_
-#endif
diff --git a/include/internal/routines/level3/xtrmm.h b/include/internal/routines/level3/xtrmm.h
deleted file mode 100644
index e18ad17a..00000000
--- a/include/internal/routines/level3/xtrmm.h
+++ /dev/null
@@ -1,54 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xtrmm routine. The implementation is based on first transforming the
-// upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM
-// routine. Therefore, this class inherits from the Xgemm class.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XTRMM_H_
-#define CLBLAST_ROUTINES_XTRMM_H_
-
-#include "internal/routines/level3/xgemm.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Xtrmm routine class, derived from Xgemm<T> (see the comment at the top of this file)
-template <typename T>
-class Xtrmm: public Xgemm<T> {
- public:
-
- // Brings these members of the dependent base class Xgemm<T> into the scope of this class
- using Xgemm<T>::precision_;
- using Xgemm<T>::routine_name_;
- using Xgemm<T>::queue_;
- using Xgemm<T>::context_;
- using Xgemm<T>::device_;
- using Xgemm<T>::db_;
- using Xgemm<T>::DoGemm;
-
- // Constructor: takes the queue and an event pointer; 'name' defaults to "TRMM"
- Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM");
-
- // Templated-precision implementation of the routine; reports its outcome as a StatusCode
- StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const T alpha, // same type 'T' as the buffer elements; this routine takes no beta
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XTRMM_H_
-#endif
diff --git a/include/internal/routines/levelx/xomatcopy.h b/include/internal/routines/levelx/xomatcopy.h
deleted file mode 100644
index d2acb50d..00000000
--- a/include/internal/routines/levelx/xomatcopy.h
+++ /dev/null
@@ -1,41 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xomatcopy routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XOMATCOPY_H_
-#define CLBLAST_ROUTINES_XOMATCOPY_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Xomatcopy routine class, derived from Routine (see the comment at the top of this file)
-template <typename T>
-class Xomatcopy: public Routine {
- public:
-
- // Constructor: takes the queue and an event pointer; 'name' defaults to "OMATCOPY"
- Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY");
-
- // Templated-precision implementation of the routine; reports its outcome as a StatusCode
- StatusCode DoOmatcopy(const Layout layout, const Transpose a_transpose,
- const size_t m, const size_t n, const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XOMATCOPY_H_
-#endif
diff --git a/include/internal/tuning.h b/include/internal/tuning.h
deleted file mode 100644
index a44f79d6..00000000
--- a/include/internal/tuning.h
+++ /dev/null
@@ -1,161 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the interface to the CLTune auto-tuner. This is only used for the optional
-// and stand-alone tuner binaries and not part of the core of CLBlast.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TUNING_H_
-#define CLBLAST_TUNING_H_
-
-#include <vector>
-#include <string>
-
-#include <cltune.h>
-
-#include "internal/utilities.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Function to get the command-line arguments, set up the input buffers, configure the tuner, and
-// collect the results. Used for all types of kernel families. Note that this is a header-only
-// function so that it is automatically compiled for the various kernels (template argument 'C').
-template <typename C, typename T>
-void Tuner(int argc, char* argv[]) {
-
- // Sets the parameters and platform/device for which to tune (command-line options)
- auto help = std::string{"* Options given/available:\n"};
- auto args = Arguments<T>{};
- args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
- args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
- args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
- for (auto &o: C::GetOptions()) { // only the options listed by 'C' are parsed
- if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, C::DefaultM()); }
- if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, C::DefaultN()); }
- if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, C::DefaultK()); }
- if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); }
- if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
- if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); }
- }
- fprintf(stdout, "%s\n", help.c_str()); // 'help' was handed to every GetArgument call above
-
- // Lets the kernel family 'C' test the validity of the given arguments
- C::TestValidArguments(args);
-
- // Tests for validity of the precision and retrieves the AMD/ARM/GPU device properties
- auto isAMD = false;
- auto isARM = false;
- auto isGPU = false;
- { // 'platform' and 'device' are local to this scope; only the three flags are kept
- const auto platform = Platform(args.platform_id);
- const auto device = Device(platform, args.device_id);
- if (!PrecisionSupported<T>(device)) {
- printf("* Unsupported precision, skipping this tuning run\n\n");
- return; // leaves the function before any tuning is set up
- }
- isAMD = device.IsAMD();
- isARM = device.IsARM();
- isGPU = device.IsGPU();
- }
-
- // Creates host-side vectors, sized by the kernel family 'C', and populates them with data
- auto x_vec = std::vector<T>(C::GetSizeX(args));
- auto y_vec = std::vector<T>(C::GetSizeY(args));
- auto a_mat = std::vector<T>(C::GetSizeA(args));
- auto b_mat = std::vector<T>(C::GetSizeB(args));
- auto c_mat = std::vector<T>(C::GetSizeC(args));
- auto temp = std::vector<T>(C::GetSizeTemp(args));
- PopulateVector(x_vec);
- PopulateVector(y_vec);
- PopulateVector(a_mat);
- PopulateVector(b_mat);
- PopulateVector(c_mat);
- PopulateVector(temp);
-
- // Initializes the tuner for the chosen platform and device
- cltune::Tuner tuner(args.platform_id, args.device_id);
-
- // Use full-search to explore all parameter combinations or random-search to search only a part of
- // the parameter values. A fraction of exactly 1.0 or 0.0 selects the full search.
- if (args.fraction == 1.0 || args.fraction == 0.0) {
- tuner.UseFullSearch();
- }
- else {
- tuner.UseRandomSearch(1.0/args.fraction); // the zero case is handled above, so no division by 0
- }
-
- // Set extra settings for specific defines. This mimics src/routine.cc.
- auto defines = std::string{""};
- if (isAMD && isGPU) {
- defines += "#define USE_CL_MAD 1\n";
- defines += "#define USE_STAGGERED_INDICES 1\n";
- }
- if (isARM && isGPU) {
- defines += "#define GLOBAL_MEM_FENCE 1\n";
- }
-
- // Prepends the defines to the kernel sources; sets both the kernel to tune and the reference
- auto sources = defines + C::GetSources();
- auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize());
- tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef());
-
- // Sets the tunable parameters, their constraints and the local memory size (all defined by 'C')
- C::SetParameters(tuner, id);
- C::SetConstraints(tuner, id);
- C::SetLocalMemorySize(tuner, id, args);
-
- // Tests for a specific precision: the same value is set for the kernel and for the reference
- tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
- tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
-
- // Modifies the thread-sizes (both global and local) based on the parameters
- for (auto &parameters: C::MulLocal()) { tuner.MulLocalSize(id, parameters); }
- for (auto &parameters: C::DivLocal()) { tuner.DivLocalSize(id, parameters); }
- for (auto &parameters: C::MulGlobal()) { tuner.MulGlobalSize(id, parameters); }
- for (auto &parameters: C::DivGlobal()) { tuner.DivGlobalSize(id, parameters); }
-
- // Sets the function's arguments: the parsed arguments plus all six host-side vectors
- C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);
-
- // Starts the tuning process
- tuner.Tune();
-
- // Prints the results to screen and keeps the time value returned by PrintToScreen
- auto time_ms = tuner.PrintToScreen();
- tuner.PrintFormatted();
-
- // Also prints the performance of the best-case in terms of GB/s or GFLOPS (skipped if time is 0)
- if (time_ms != 0.0) {
- printf("[ -------> ] %.1lf ms", time_ms);
- printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str());
- }
-
- // Outputs the results as JSON to disk, including some meta-data
- auto precision_string = std::to_string(static_cast<size_t>(args.precision));
- auto metadata = std::vector<std::pair<std::string,std::string>>{
- {"kernel_family", C::KernelFamily()},
- {"precision", precision_string}
- };
- for (auto &o: C::GetOptions()) { // note: kArgFraction is not added to the meta-data
- if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); }
- if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); }
- if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); }
- if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); }
- if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); }
- }
- tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata);
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TUNING_H_
-#endif
diff --git a/include/internal/utilities.h b/include/internal/utilities.h
deleted file mode 100644
index 7092bcdd..00000000
--- a/include/internal/utilities.h
+++ /dev/null
@@ -1,257 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file provides declarations for the common (test) utility functions such as a command-line
-// argument parser. On top of this, it serves as the 'common' header, including the C++ OpenCL
-// wrapper. These utilities are not only used for CLBlast, but also included as part of the tuners,
-// the performance client and the correctness testers.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_UTILITIES_H_
-#define CLBLAST_UTILITIES_H_
-
-#include <string>
-#include <functional>
-#include <complex>
-
-#include "clblast.h"
-#include "clblast_half.h"
-#include "internal/clpp11.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Shorthands for the single-precision and double-precision complex data-types
-using float2 = std::complex<float>;
-using double2 = std::complex<double>;
-
-// Names of the Khronos OpenCL extensions for half-precision and double-precision
-const std::string kKhronosHalfPrecision = "cl_khr_fp16";
-const std::string kKhronosDoublePrecision = "cl_khr_fp64";
-
-// Value used when an unknown error was caught
-constexpr auto kUnknownError = -999;
-
-// =================================================================================================
-
-// The routine-specific arguments in string form (the option names given to the argument parser)
-constexpr auto kArgM = "m";
-constexpr auto kArgN = "n";
-constexpr auto kArgK = "k";
-constexpr auto kArgKL = "kl";
-constexpr auto kArgKU = "ku";
-constexpr auto kArgLayout = "layout";
-constexpr auto kArgATransp = "transA";
-constexpr auto kArgBTransp = "transB";
-constexpr auto kArgSide = "side";
-constexpr auto kArgTriangle = "triangle";
-constexpr auto kArgDiagonal = "diagonal";
-constexpr auto kArgXInc = "incx";
-constexpr auto kArgYInc = "incy";
-constexpr auto kArgXOffset = "offx";
-constexpr auto kArgYOffset = "offy";
-constexpr auto kArgALeadDim = "lda";
-constexpr auto kArgBLeadDim = "ldb";
-constexpr auto kArgCLeadDim = "ldc";
-constexpr auto kArgAOffset = "offa";
-constexpr auto kArgBOffset = "offb";
-constexpr auto kArgCOffset = "offc";
-constexpr auto kArgAPOffset = "offap";
-constexpr auto kArgDotOffset = "offdot";
-constexpr auto kArgNrm2Offset = "offnrm2";
-constexpr auto kArgAsumOffset = "offasum";
-constexpr auto kArgImaxOffset = "offimax";
-constexpr auto kArgAlpha = "alpha";
-constexpr auto kArgBeta = "beta";
-
-// The tuner-specific arguments in string form
-constexpr auto kArgFraction = "fraction";
-
-// The client-specific arguments in string form
-constexpr auto kArgCompareclblas = "clblas";
-constexpr auto kArgComparecblas = "cblas";
-constexpr auto kArgStepSize = "step";
-constexpr auto kArgNumSteps = "num_steps";
-constexpr auto kArgNumRuns = "runs";
-
-// More arguments in string form (NOTE(review): these look tester-specific — TODO confirm)
-constexpr auto kArgFullTest = "full_test";
-constexpr auto kArgVerbose = "verbose";
-
-// The common arguments in string form
-constexpr auto kArgPlatform = "platform";
-constexpr auto kArgDevice = "device";
-constexpr auto kArgPrecision = "precision";
-constexpr auto kArgHelp = "h";
-constexpr auto kArgQuiet = "q";
-constexpr auto kArgNoAbbreviations = "no_abbrv";
-
-// =================================================================================================
-
-// Returns a scalar of type 'T' with a default value (declared here only, no definition in view)
-template <typename T>
-T GetScalar();
-
-// Returns a scalar of type 'T' with value 1 (declared here only, no definition in view)
-template <typename T>
-T ConstantOne();
-
-// =================================================================================================
-
-// Structure containing all possible arguments for test clients, including their default values
-template <typename T>
-struct Arguments {
- // Routine-specific arguments
- size_t m = 1;
- size_t n = 1;
- size_t k = 1;
- size_t ku = 1;
- size_t kl = 1;
- Layout layout = Layout::kRowMajor;
- Transpose a_transpose = Transpose::kNo;
- Transpose b_transpose = Transpose::kNo;
- Side side = Side::kLeft;
- Triangle triangle = Triangle::kUpper;
- Diagonal diagonal = Diagonal::kUnit;
- size_t x_inc = 1;
- size_t y_inc = 1;
- size_t x_offset = 0;
- size_t y_offset = 0;
- size_t a_ld = 1;
- size_t b_ld = 1;
- size_t c_ld = 1;
- size_t a_offset = 0;
- size_t b_offset = 0;
- size_t c_offset = 0;
- size_t ap_offset = 0;
- size_t dot_offset = 0;
- size_t nrm2_offset = 0;
- size_t asum_offset = 0;
- size_t imax_offset = 0;
- T alpha = ConstantOne<T>(); // both scalars default to the value 1 of type 'T'
- T beta = ConstantOne<T>();
- size_t x_size = 1;
- size_t y_size = 1;
- size_t a_size = 1;
- size_t b_size = 1;
- size_t c_size = 1;
- size_t ap_size = 1;
- size_t scalar_size = 1;
- // Tuner-specific arguments
- double fraction = 1.0;
- // Client-specific arguments
- int compare_clblas = 1;
- int compare_cblas = 1;
- size_t step = 1;
- size_t num_steps = 0;
- size_t num_runs = 10;
- // Common arguments
- size_t platform_id = 0;
- size_t device_id = 0;
- Precision precision = Precision::kSingle;
- bool print_help = false;
- bool silent = false;
- bool no_abbrv = false;
-};
-
-// Structure containing all possible buffers for test clients, all with element type 'T'
-template <typename T>
-struct Buffers {
- Buffer<T> x_vec;
- Buffer<T> y_vec;
- Buffer<T> a_mat;
- Buffer<T> b_mat;
- Buffer<T> c_mat;
- Buffer<T> ap_mat;
- Buffer<T> scalar;
-};
-
-// =================================================================================================
-
-// Converts a value (e.g. an integer) to a std::string. This also covers special cases for CLBlast
-// data-types such as the Layout and Transpose data-types. Declared here only, taking 'T' by value.
-template <typename T>
-std::string ToString(T value);
-
-// =================================================================================================
-
-// Helper for the function "GetArgument": produces a value of type 'T' from a C-string
-template <typename T>
-T ConvertArgument(const char* value);
-
-// Basic argument parser, matching patterns in the form of "-option value" and "--option value"
-template <typename T>
-T GetArgument(const int argc, char *argv[], std::string &help,
- const std::string &option, const T default_value);
-
-// Returns the precision only; 'default_precision' itself defaults to Precision::kSingle
-Precision GetPrecision(const int argc, char *argv[],
- const Precision default_precision = Precision::kSingle);
-
-// As in "GetArgument", but now only checks whether an argument is given or not
-bool CheckArgument(const int argc, char *argv[], std::string &help, const std::string &option);
-
-// =================================================================================================
-
-// Helper function to check for errors: true for any status code other than StatusCode::kSuccess
-constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
-
-// =================================================================================================
-
-// Returns a random number to be used as a seed
-unsigned int GetRandomSeed();
-
-// Test/example data lower and upper limit (both deduced as double from the literals)
-constexpr auto kTestDataLowerLimit = -2.0;
-constexpr auto kTestDataUpperLimit = 2.0;
-
-// Populates a vector with random data; the vector is taken by non-const reference
-template <typename T>
-void PopulateVector(std::vector<T> &vector);
-
-// =================================================================================================
-
-// Conversion between half and single-precision for std::vectors ('result' is an output argument)
-std::vector<float> HalfToFloatBuffer(const std::vector<half>& source);
-void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source);
-
-// As above, but now for OpenCL data-types instead of std::vectors (takes a raw command queue)
-Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw);
-void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw);
-
-// =================================================================================================
-
-// Rounding functions (NOTE(review): exact rounding semantics are not visible here — TODO confirm)
-size_t CeilDiv(const size_t x, const size_t y);
-size_t Ceil(const size_t x, const size_t y);
-
-// Returns whether or not 'a' is a multiple of 'b'
-bool IsMultiple(const size_t a, const size_t b);
-
-// =================================================================================================
-
-// Converts the precision enum into a number of bytes, e.g. a double takes up 8 bytes
-size_t GetBytes(const Precision precision);
-
-// Converts the template argument 'T' into a Precision value
-template <typename T>
-Precision PrecisionValue();
-
-// =================================================================================================
-
-// Returns false if this precision (template argument 'T') is not supported by the device
-template <typename T>
-bool PrecisionSupported(const Device &device);
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_UTILITIES_H_
-#endif