diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-06-18 20:20:13 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2016-06-18 20:20:13 +0200 |
commit | f726fbdc9fef937fbe32222f0e66aac8d7e2678c (patch) | |
tree | cb62cc877ea239052fb1882f7bf327aace3e7776 /include/internal | |
parent | bacb5d2bb2ea7b141034878090aca850db8f9d00 (diff) |
Moved all headers into the source tree, changed headers to .hpp extension
Diffstat (limited to 'include/internal')
62 files changed, 0 insertions, 5992 deletions
diff --git a/include/internal/buffer_test.h b/include/internal/buffer_test.h deleted file mode 100644 index 80f5243f..00000000 --- a/include/internal/buffer_test.h +++ /dev/null @@ -1,121 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are -// templated and thus header-only. -// -// ================================================================================================= - -#ifndef CLBLAST_BUFFER_TEST_H_ -#define CLBLAST_BUFFER_TEST_H_ - -#include "clblast.h" - -namespace clblast { -// ================================================================================================= - -// Tests matrix 'A' for validity -template <typename T> -StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer, - const size_t offset, const size_t ld) { - if (ld < one) { return StatusCode::kInvalidLeadDimA; } - try { - const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; } - } catch (...) { return StatusCode::kInvalidMatrixA; } - return StatusCode::kSuccess; -} - -// Tests matrix 'B' for validity -template <typename T> -StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer, - const size_t offset, const size_t ld) { - if (ld < one) { return StatusCode::kInvalidLeadDimB; } - try { - const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryB; } - } catch (...) 
{ return StatusCode::kInvalidMatrixB; } - return StatusCode::kSuccess; -} - -// Tests matrix 'C' for validity -template <typename T> -StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer, - const size_t offset, const size_t ld) { - if (ld < one) { return StatusCode::kInvalidLeadDimC; } - try { - const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryC; } - } catch (...) { return StatusCode::kInvalidMatrixC; } - return StatusCode::kSuccess; -} - -// Tests matrix 'AP' for validity -template <typename T> -StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) { - try { - const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; } - } catch (...) { return StatusCode::kInvalidMatrixA; } - return StatusCode::kSuccess; -} - -// ================================================================================================= - -// Tests vector 'X' for validity -template <typename T> -StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset, - const size_t inc) { - if (inc == 0) { return StatusCode::kInvalidIncrementX; } - try { - const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryX; } - } catch (...) { return StatusCode::kInvalidVectorX; } - return StatusCode::kSuccess; -} - -// Tests vector 'Y' for validity -template <typename T> -StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset, - const size_t inc) { - if (inc == 0) { return StatusCode::kInvalidIncrementY; } - try { - const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryY; } - } catch (...) 
{ return StatusCode::kInvalidVectorY; } - return StatusCode::kSuccess; -} - -// ================================================================================================= - -// Tests vector 'scalar' for validity -template <typename T> -StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) { - try { - const auto required_size = (n + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; } - } catch (...) { return StatusCode::kInvalidVectorScalar; } - return StatusCode::kSuccess; -} - -// Tests vector 'index' for validity -template <typename T> -StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) { - try { - const auto required_size = (n + offset) * sizeof(T); - if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; } - } catch (...) { return StatusCode::kInvalidVectorScalar; } - return StatusCode::kSuccess; -} - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_BUFFER_TEST_H_ -#endif diff --git a/include/internal/cache.h b/include/internal/cache.h deleted file mode 100644 index bc7e87d9..00000000 --- a/include/internal/cache.h +++ /dev/null @@ -1,98 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the caching functionality of compiled binaries and programs. 
-// -// ================================================================================================= - -#ifndef CLBLAST_CACHE_H_ -#define CLBLAST_CACHE_H_ - -#include <string> -#include <vector> -#include <mutex> - -#include "internal/utilities.h" - -namespace clblast { -// ================================================================================================= - -// The cache of compiled OpenCL binaries, along with some meta-data -struct BinaryCache { - std::string binary; - std::string device_name; - Precision precision; - std::string routine_name_; - - // Finds out whether the properties match - bool MatchInCache(const std::string &ref_device, const Precision &ref_precision, - const std::string &ref_routine) { - return (device_name == ref_device && - precision == ref_precision && - routine_name_ == ref_routine); - } -}; - -// The actual cache, implemented as a vector of the above data-type, and its mutex -static std::vector<BinaryCache> binary_cache_; -static std::mutex binary_cache_mutex_; - -// ================================================================================================= - -// The cache of compiled OpenCL programs, along with some meta-data -struct ProgramCache { - Program program; - ContextPointer context_ptr; - Precision precision; - std::string routine_name_; - - // Finds out whether the properties match - bool MatchInCache(const ContextPointer ref_context, const Precision &ref_precision, - const std::string &ref_routine) { - return (context_ptr == ref_context && - precision == ref_precision && - routine_name_ == ref_routine); - } -}; - -// The actual cache, implemented as a vector of the above data-type, and its mutex -static std::vector<ProgramCache> program_cache_; -static std::mutex program_cache_mutex_; - -// ================================================================================================= - -// Stores the compiled binary or program in the cache -void StoreBinaryToCache(const std::string &binary, const 
std::string &device_name, - const Precision &precision, const std::string &routine_name); -void StoreProgramToCache(const Program &program, const Context &context, - const Precision &precision, const std::string &routine_name); - -// Queries the cache and retrieves a matching binary or program. Assumes that the match is -// available, throws otherwise. -const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name); -const Program& GetProgramFromCache(const Context &context, const Precision &precision, - const std::string &routine_name); - -// Queries the cache to see whether or not the compiled kernel is already there -bool BinaryIsInCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name); -bool ProgramIsInCache(const Context &context, const Precision &precision, - const std::string &routine_name); - -// ================================================================================================= - -// Clears the cache of stored binaries -StatusCode CacheClearAll(); - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_CACHE_H_ -#endif diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h deleted file mode 100644 index b834d8b4..00000000 --- a/include/internal/clpp11.h +++ /dev/null @@ -1,695 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API -// calls. 
The main benefits are increased abstraction, automatic memory management, and portability. -// Portability here means that a similar header exists for CUDA with the same classes and -// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. -// -// This file is taken from the Claduc project <https://github.com/CNugteren/Claduc> and therefore -// contains the following header copyright notice: -// -// ================================================================================================= -// -// Copyright 2015 SURFsara -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// -// ================================================================================================= - -#ifndef CLBLAST_CLPP11_H_ -#define CLBLAST_CLPP11_H_ - -// C++ -#include <algorithm> // std::copy -#include <string> // std::string -#include <vector> // std::vector -#include <memory> // std::shared_ptr -#include <stdexcept> // std::runtime_error -#include <numeric> // std::accumulate - -// OpenCL -#if defined(__APPLE__) || defined(__MACOSX) - #include <OpenCL/opencl.h> -#else - #include <CL/opencl.h> -#endif - -namespace clblast { -// ================================================================================================= - -// Error occurred in the C++11 OpenCL header (this file) -inline void Error(const std::string &message) { - throw std::runtime_error("Internal OpenCL error: "+message); -} - -// Error occurred in OpenCL -inline void CheckError(const cl_int status) { - if (status != CL_SUCCESS) { - throw std::runtime_error("Internal OpenCL error: "+std::to_string(status)); - } -} - -// ================================================================================================= - -// C++11 version of 'cl_event' -class Event { - public: - - // Constructor based on the regular OpenCL data-type - explicit Event(const cl_event event): event_(event) { } - - // Regular constructor - explicit Event(): event_(nullptr) { } - - // Waits for completion of this event - void WaitForCompletion() const { - CheckError(clWaitForEvents(1, &event_)); - } - - // Retrieves the elapsed time of the last recorded event. 
Note that no error checking is done on - // the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation: - // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx - float GetElapsedTime() const { - WaitForCompletion(); - auto bytes = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); - auto time_start = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); - auto time_end = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); - return (time_end - time_start) * 1.0e-6f; - } - - // Accessor to the private data-member - cl_event& operator()() { return event_; } - cl_event* pointer() { return &event_; } - private: - cl_event event_; -}; - -// Pointer to an OpenCL event -using EventPointer = cl_event*; - -// ================================================================================================= - -// C++11 version of 'cl_platform_id' -class Platform { - public: - - // Constructor based on the regular OpenCL data-type - explicit Platform(const cl_platform_id platform): platform_(platform) { } - - // Initializes the platform - explicit Platform(const size_t platform_id) { - auto num_platforms = cl_uint{0}; - CheckError(clGetPlatformIDs(0, nullptr, &num_platforms)); - if (num_platforms == 0) { Error("no platforms found"); } - auto platforms = std::vector<cl_platform_id>(num_platforms); - CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr)); - if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); } - platform_ = platforms[platform_id]; - } - - // Returns the number of devices on this platform - size_t NumDevices() const { - auto result = cl_uint{0}; - CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &result)); 
- return static_cast<size_t>(result); - } - - // Accessor to the private data-member - const cl_platform_id& operator()() const { return platform_; } - private: - cl_platform_id platform_; -}; - -// ================================================================================================= - -// C++11 version of 'cl_device_id' -class Device { - public: - - // Constructor based on the regular OpenCL data-type - explicit Device(const cl_device_id device): device_(device) { } - - // Initialize the device. Note that this constructor can throw exceptions! - explicit Device(const Platform &platform, const size_t device_id) { - auto num_devices = platform.NumDevices(); - if (num_devices == 0) { Error("no devices found"); } - auto devices = std::vector<cl_device_id>(num_devices); - CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices), - devices.data(), nullptr)); - if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); } - device_ = devices[device_id]; - } - - // Methods to retrieve device information - std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); } - std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); } - std::string Name() const { return GetInfoString(CL_DEVICE_NAME); } - std::string Type() const { - auto type = GetInfo<cl_device_type>(CL_DEVICE_TYPE); - switch(type) { - case CL_DEVICE_TYPE_CPU: return "CPU"; - case CL_DEVICE_TYPE_GPU: return "GPU"; - case CL_DEVICE_TYPE_ACCELERATOR: return "accelerator"; - default: return "default"; - } - } - size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); } - size_t MaxWorkItemDimensions() const { - return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS); - } - std::vector<size_t> MaxWorkItemSizes() const { - return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES); - } - size_t LocalMemSize() const { - return static_cast<size_t>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE)); - } 
- std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); } - size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); } - size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); } - size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); } - size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); } - size_t MemoryClock() const { return 0; } // Not exposed in OpenCL - size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL - - // Configuration-validity checks - bool IsLocalMemoryValid(const size_t local_mem_usage) const { - return (local_mem_usage <= LocalMemSize()); - } - bool IsThreadConfigValid(const std::vector<size_t> &local) const { - auto local_size = size_t{1}; - for (const auto &item: local) { local_size *= item; } - for (auto i=size_t{0}; i<local.size(); ++i) { - if (local[i] > MaxWorkItemSizes()[i]) { return false; } - } - if (local_size > MaxWorkGroupSize()) { return false; } - if (local.size() > MaxWorkItemDimensions()) { return false; } - return true; - } - - // Query for a specific type of device or brand - bool IsCPU() const { return Type() == "CPU"; } - bool IsGPU() const { return Type() == "GPU"; } - bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; } - bool IsARM() const { return Vendor() == "ARM"; } - - // Accessor to the private data-member - const cl_device_id& operator()() const { return device_; } - private: - cl_device_id device_; - - // Private helper functions - template <typename T> - T GetInfo(const cl_device_info info) const { - auto bytes = size_t{0}; - CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); - auto result = T(0); - CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); - return result; - } - size_t GetInfo(const cl_device_info info) const { - auto bytes = size_t{0}; - CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); - auto result = 
cl_uint(0); - CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); - return static_cast<size_t>(result); - } - template <typename T> - std::vector<T> GetInfoVector(const cl_device_info info) const { - auto bytes = size_t{0}; - CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); - auto result = std::vector<T>(bytes/sizeof(T)); - CheckError(clGetDeviceInfo(device_, info, bytes, result.data(), nullptr)); - return result; - } - std::string GetInfoString(const cl_device_info info) const { - auto bytes = size_t{0}; - CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); - auto result = std::string{}; - result.resize(bytes); - CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr)); - return std::string{result.c_str()}; // Removes any trailing '\0'-characters - } -}; - -// ================================================================================================= - -// C++11 version of 'cl_context' -class Context { - public: - - // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere - explicit Context(const cl_context context): - context_(new cl_context) { - *context_ = context; - } - - // Regular constructor with memory management - explicit Context(const Device &device): - context_(new cl_context, [](cl_context* c) { CheckError(clReleaseContext(*c)); delete c; }) { - auto status = CL_SUCCESS; - const cl_device_id dev = device(); - *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status); - CheckError(status); - } - - // Accessor to the private data-member - const cl_context& operator()() const { return *context_; } - cl_context* pointer() const { return &(*context_); } - private: - std::shared_ptr<cl_context> context_; -}; - -// Pointer to an OpenCL context -using ContextPointer = cl_context*; - -// ================================================================================================= - -// Enumeration of build statuses of the run-time compilation 
process -enum class BuildStatus { kSuccess, kError, kInvalid }; - -// C++11 version of 'cl_program'. Additionally holds the program's source code. -class Program { - public: - // Note that there is no constructor based on the regular OpenCL data-type because of extra state - - // Source-based constructor with memory management - explicit Program(const Context &context, std::string source): - program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), - length_(source.length()), - source_(std::move(source)), - source_ptr_(&source_[0]) { - auto status = CL_SUCCESS; - *program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status); - CheckError(status); - } - - // Binary-based constructor with memory management - explicit Program(const Device &device, const Context &context, const std::string& binary): - program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), - length_(binary.length()), - source_(binary), - source_ptr_(&source_[0]) { - auto status1 = CL_SUCCESS; - auto status2 = CL_SUCCESS; - const cl_device_id dev = device(); - *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_, - reinterpret_cast<const unsigned char**>(&source_ptr_), - &status1, &status2); - CheckError(status1); - CheckError(status2); - } - - // Compiles the device program and returns whether or not there where any warnings/errors - BuildStatus Build(const Device &device, std::vector<std::string> &options) { - auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "}); - const cl_device_id dev = device(); - auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr); - if (status == CL_BUILD_PROGRAM_FAILURE) { - return BuildStatus::kError; - } - else if (status == CL_INVALID_BINARY) { - return BuildStatus::kInvalid; - } - else { - CheckError(status); - return BuildStatus::kSuccess; - } - } - - // Retrieves the warning/error message 
from the compiler (if any) - std::string GetBuildInfo(const Device &device) const { - auto bytes = size_t{0}; - auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG}; - CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes)); - auto result = std::string{}; - result.resize(bytes); - CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr)); - return result; - } - - // Retrieves a binary or an intermediate representation of the compiled program - std::string GetIR() const { - auto bytes = size_t{0}; - CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr)); - auto result = std::string{}; - result.resize(bytes); - auto result_ptr = result.data(); - CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr)); - return result; - } - - // Accessor to the private data-member - const cl_program& operator()() const { return *program_; } - private: - std::shared_ptr<cl_program> program_; - size_t length_; - std::string source_; // Note: the source can also be a binary or IR - const char* source_ptr_; -}; - -// ================================================================================================= - -// C++11 version of 'cl_command_queue' -class Queue { - public: - - // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere - explicit Queue(const cl_command_queue queue): - queue_(new cl_command_queue) { - *queue_ = queue; - } - - // Regular constructor with memory management - explicit Queue(const Context &context, const Device &device): - queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s)); - delete s; }) { - auto status = CL_SUCCESS; - #ifdef CL_VERSION_2_0 - cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; - *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status); - #else - *queue_ 
= clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); - #endif - CheckError(status); - } - - // Synchronizes the queue - void Finish(Event &) const { - Finish(); - } - void Finish() const { - CheckError(clFinish(*queue_)); - } - - // Retrieves the corresponding context or device - Context GetContext() const { - auto bytes = size_t{0}; - CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, 0, nullptr, &bytes)); - cl_context result; - CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr)); - return Context(result); - } - Device GetDevice() const { - auto bytes = size_t{0}; - CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes)); - cl_device_id result; - CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr)); - return Device(result); - } - - // Accessor to the private data-member - const cl_command_queue& operator()() const { return *queue_; } - private: - std::shared_ptr<cl_command_queue> queue_; -}; - -// ================================================================================================= - -// C++11 version of host memory -template <typename T> -class BufferHost { - public: - - // Regular constructor with memory management - explicit BufferHost(const Context &, const size_t size): - buffer_(new std::vector<T>(size)) { - } - - // Retrieves the actual allocated size in bytes - size_t GetSize() const { - return buffer_->size()*sizeof(T); - } - - // Compatibility with std::vector - size_t size() const { return buffer_->size(); } - T* begin() { return &(*buffer_)[0]; } - T* end() { return &(*buffer_)[buffer_->size()-1]; } - T& operator[](const size_t i) { return (*buffer_)[i]; } - T* data() { return buffer_->data(); } - const T* data() const { return buffer_->data(); } - - private: - std::shared_ptr<std::vector<T>> buffer_; -}; - -// ================================================================================================= - -// 
Enumeration of buffer access types -enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }; - -// C++11 version of 'cl_mem' -template <typename T> -class Buffer { - public: - - // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere - explicit Buffer(const cl_mem buffer): - buffer_(new cl_mem), - access_(BufferAccess::kNotOwned) { - *buffer_ = buffer; - } - - // Regular constructor with memory management. If this class does not own the buffer object, then - // the memory will not be freed automatically afterwards. - explicit Buffer(const Context &context, const BufferAccess access, const size_t size): - buffer_(new cl_mem, [access](cl_mem* m) { - if (access != BufferAccess::kNotOwned) { CheckError(clReleaseMemObject(*m)); } - delete m; - }), - access_(access) { - auto flags = cl_mem_flags{CL_MEM_READ_WRITE}; - if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; } - if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; } - auto status = CL_SUCCESS; - *buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status); - CheckError(status); - } - - // As above, but now with read/write access as a default - explicit Buffer(const Context &context, const size_t size): - Buffer<T>(context, BufferAccess::kReadWrite, size) { - } - - // Constructs a new buffer based on an existing host-container - template <typename Iterator> - explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end): - Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) { - auto size = static_cast<size_t>(end - start); - auto pointer = &*start; - CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer, 0, - nullptr, nullptr)); - queue.Finish(); - } - - // Copies from device to host: reading the device buffer a-synchronously - void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { - 
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); } - CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T), - host, 0, nullptr, nullptr)); - } - void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host, - const size_t offset = 0) const { - if (host.size() < size) { Error("target host buffer is too small"); } - ReadAsync(queue, size, host.data(), offset); - } - void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host, - const size_t offset = 0) const { - if (host.size() < size) { Error("target host buffer is too small"); } - ReadAsync(queue, size, host.data(), offset); - } - - // Copies from device to host: reading the device buffer - void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { - ReadAsync(queue, size, host, offset); - queue.Finish(); - } - void Read(const Queue &queue, const size_t size, std::vector<T> &host, - const size_t offset = 0) const { - Read(queue, size, host.data(), offset); - } - void Read(const Queue &queue, const size_t size, BufferHost<T> &host, - const size_t offset = 0) const { - Read(queue, size, host.data(), offset); - } - - // Copies from host to device: writing the device buffer a-synchronously - void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { - if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); } - if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); } - CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T), - host, 0, nullptr, nullptr)); - } - void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host, - const size_t offset = 0) { - WriteAsync(queue, size, host.data(), offset); - } - void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host, - const size_t offset = 0) { - WriteAsync(queue, 
size, host.data(), offset); - } - - // Copies from host to device: writing the device buffer - void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { - WriteAsync(queue, size, host, offset); - queue.Finish(); - } - void Write(const Queue &queue, const size_t size, const std::vector<T> &host, - const size_t offset = 0) { - Write(queue, size, host.data(), offset); - } - void Write(const Queue &queue, const size_t size, const BufferHost<T> &host, - const size_t offset = 0) { - Write(queue, size, host.data(), offset); - } - - // Copies the contents of this buffer into another device buffer - void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const { - CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0, - nullptr, nullptr)); - } - void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const { - CopyToAsync(queue, size, destination); - queue.Finish(); - } - - // Retrieves the actual allocated size in bytes - size_t GetSize() const { - auto bytes = size_t{0}; - CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes)); - auto result = size_t{0}; - CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr)); - return result; - } - - // Accessor to the private data-member - const cl_mem& operator()() const { return *buffer_; } - private: - std::shared_ptr<cl_mem> buffer_; - const BufferAccess access_; -}; - -// ================================================================================================= - -// C++11 version of 'cl_kernel' -class Kernel { - public: - - // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere - explicit Kernel(const cl_kernel kernel): - kernel_(new cl_kernel) { - *kernel_ = kernel; - } - - // Regular constructor with memory management - explicit Kernel(const Program &program, const std::string &name): - kernel_(new cl_kernel, 
[](cl_kernel* k) { CheckError(clReleaseKernel(*k)); delete k; }) { - auto status = CL_SUCCESS; - *kernel_ = clCreateKernel(program(), name.c_str(), &status); - CheckError(status); - } - - // Sets a kernel argument at the indicated position - template <typename T> - void SetArgument(const size_t index, const T &value) { - CheckError(clSetKernelArg(*kernel_, static_cast<cl_uint>(index), sizeof(T), &value)); - } - template <typename T> - void SetArgument(const size_t index, Buffer<T> &value) { - SetArgument(index, value()); - } - - // Sets all arguments in one go using parameter packs. Note that this overwrites previously set - // arguments using 'SetArgument' or 'SetArguments'. - template <typename... Args> - void SetArguments(Args&... args) { - SetArgumentsRecursive(0, args...); - } - - // Retrieves the amount of local memory used per work-group for this kernel - size_t LocalMemUsage(const Device &device) const { - auto bytes = size_t{0}; - auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE}; - CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes)); - auto result = size_t{0}; - CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr)); - return result; - } - - // Launches a kernel onto the specified queue - void Launch(const Queue &queue, const std::vector<size_t> &global, - const std::vector<size_t> &local, EventPointer event) { - CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()), - nullptr, global.data(), local.data(), - 0, nullptr, event)); - } - - // As above, but with an event waiting list - void Launch(const Queue &queue, const std::vector<size_t> &global, - const std::vector<size_t> &local, EventPointer event, - std::vector<Event>& waitForEvents) { - if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } - - // Builds a plain version of the events waiting list - auto waitForEventsPlain = std::vector<cl_event>(); - for (auto 
&waitEvent : waitForEvents) { - waitForEventsPlain.push_back(waitEvent()); - } - - // Launches the kernel while waiting for other events - CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()), - nullptr, global.data(), local.data(), - static_cast<cl_uint>(waitForEventsPlain.size()), - waitForEventsPlain.data(), - event)); - } - - // As above, but with the default local workgroup size - void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) { - CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()), - nullptr, global.data(), nullptr, - 0, nullptr, event)); - } - - // Accessor to the private data-member - const cl_kernel& operator()() const { return *kernel_; } - private: - std::shared_ptr<cl_kernel> kernel_; - - // Internal implementation for the recursive SetArguments function. - template <typename T> - void SetArgumentsRecursive(const size_t index, T &first) { - SetArgument(index, first); - } - template <typename T, typename... Args> - void SetArgumentsRecursive(const size_t index, T &first, Args&... args) { - SetArgument(index, first); - SetArgumentsRecursive(index+1, args...); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_CLPP11_H_ -#endif diff --git a/include/internal/database.h b/include/internal/database.h deleted file mode 100644 index f93eaa22..00000000 --- a/include/internal/database.h +++ /dev/null @@ -1,104 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Database class, providing a static variable holding the actual database -// information. The class also provides utility functions to search the database and to access a -// found entry by parameter-key. The database itself is filled in the corresponding source-file and -// partially also by the database/xxxxx.h files, in which kernel-specific parameters are found. -// -// ================================================================================================= - -#ifndef CLBLAST_DATABASE_H_ -#define CLBLAST_DATABASE_H_ - -#include <string> -#include <vector> -#include <unordered_map> - -#include "internal/utilities.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -class Database { - public: - - // Type alias for the database parameters - using Parameters = std::unordered_map<std::string,size_t>; - - // Structures for content inside the database - struct DatabaseDevice { - const std::string name; - const Parameters parameters; - }; - struct DatabaseVendor { - const std::string type; - const std::string name; - const std::vector<DatabaseDevice> devices; - }; - struct DatabaseEntry { - const std::string kernel; - const Precision precision; - const std::vector<DatabaseVendor> vendors; - }; - - // The OpenCL device types - static constexpr auto kDeviceTypeCPU = "CPU"; - static constexpr auto kDeviceTypeGPU = "GPU"; - static constexpr auto kDeviceTypeAccelerator = "accelerator"; - static constexpr auto kDeviceTypeAll = "default"; - - // The OpenCL device vendors - static constexpr auto kDeviceVendorAll = "default"; - - // Alternative names for some OpenCL vendors - const std::unordered_map<std::string,std::string> kVendorNames { - {"Intel(R) Corporation", "Intel"}, - {"GenuineIntel", "Intel"}, - {"Advanced Micro Devices, Inc.", 
"AMD"}, - {"NVIDIA Corporation", "NVIDIA"}, - }; - - // The database consists of separate database entries, stored together in a vector - static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble; - static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble; - static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble; - static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble; - static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; - static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble; - static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble; - static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble; - static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble; - static const std::vector<DatabaseEntry> database; - - // The constructor - explicit Database(const Queue &queue, const std::vector<std::string> &routines, - const Precision precision); - - // Accessor of values by key - size_t operator[](const std::string key) const { return parameters_.find(key)->second; } - - // Obtain a list of OpenCL pre-processor defines based on the parameters - std::string GetDefines() const; - - private: - Parameters Search(const std::string &this_kernel, const std::string &this_type, - const std::string &this_vendor, const std::string &this_device, - const Precision this_precision) const; - - // Found parameters suitable for this device/kernel - Parameters parameters_; -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_DATABASE_H_ -#endif diff 
--git a/include/internal/database/copy.h b/include/internal/database/copy.h deleted file mode 100644 index 201e8b8a..00000000 --- a/include/internal/database/copy.h +++ /dev/null @@ -1,262 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Database generator <database.py> -// -// This file populates the database with best-found tuning parameters for the 'Copy' kernels. -// -// ================================================================================================= - -namespace clblast { -// ================================================================================================= - -const Database::DatabaseEntry Database::CopyHalf = { - "Copy", Precision::kHalf, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::CopySingle = { - "Copy", Precision::kSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, - { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, - { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, 
{"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, - { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, - { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, - { "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, - { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} 
} }, - { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, - { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, - { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::CopyComplexSingle = { - "Copy", Precision::kComplexSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, - { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, - { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { 
"Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, - { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, - { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, - { "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::CopyDouble = { - "Copy", Precision::kDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "Hawaii", { {"COPY_DIMX",32}, 
{"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, - { "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "GeForce GTX TITAN X", { 
{"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::CopyComplexDouble = { - "Copy", Precision::kComplexDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, - { "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, 
"Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, - { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= -} // namespace clblast diff --git a/include/internal/database/pad.h b/include/internal/database/pad.h deleted file mode 100644 index cc703dd6..00000000 --- a/include/internal/database/pad.h +++ /dev/null @@ -1,270 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Database generator <database.py> -// -// This file populates the database with best-found tuning parameters for the 'Pad' kernels. -// -// ================================================================================================= - -namespace clblast { -// ================================================================================================= - -const Database::DatabaseEntry Database::PadHalf = { - "Pad", Precision::kHalf, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::PadSingle = { - "Pad", Precision::kSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, - { "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { 
"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, - { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "Tesla K40m", { {"PAD_DIMX",32}, 
{"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::PadComplexSingle = { - "Pad", Precision::kComplexSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, - { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, - { "Iris Pro", { 
{"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::PadDouble = { - "Pad", Precision::kDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, 
{"PAD_WPTY",2} } }, - { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} 
} }, - { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::PadComplexDouble = { - "Pad", Precision::kComplexDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, 
{"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - } - }, - } -}; - -// ================================================================================================= -} // namespace clblast diff --git a/include/internal/database/padtranspose.h b/include/internal/database/padtranspose.h deleted file mode 100644 index f3b1f262..00000000 --- a/include/internal/database/padtranspose.h +++ /dev/null @@ -1,270 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Database generator <database.py> -// -// This file populates the database with best-found tuning parameters for the 'Padtranspose' kernels. -// -// ================================================================================================= - -namespace clblast { -// ================================================================================================= - -const Database::DatabaseEntry Database::PadtransposeHalf = { - "Padtranspose", Precision::kHalf, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::PadtransposeSingle = { - "Padtranspose", Precision::kSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, - { "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, - { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, - { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, 
{"PADTRA_WPT",8} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, - { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, - { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, - } - }, - } -}; - -// 
================================================================================================= - -const Database::DatabaseEntry Database::PadtransposeComplexSingle = { - "Padtranspose", Precision::kComplexSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, - { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, - { "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - } - 
}, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::PadtransposeDouble = { - "Padtranspose", Precision::kDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, - { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 
2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, - { "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::PadtransposeComplexDouble = { - "Padtranspose", Precision::kComplexDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { 
{"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "Tesla K20m", { {"PADTRA_PAD",1}, 
{"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= -} // namespace clblast diff --git a/include/internal/database/transpose.h b/include/internal/database/transpose.h deleted file mode 100644 index 0c893dae..00000000 --- a/include/internal/database/transpose.h +++ /dev/null @@ -1,258 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Database generator <database.py> -// -// This file populates the database with best-found tuning parameters for the 'Transpose' kernels. 
-// -// ================================================================================================= - -namespace clblast { -// ================================================================================================= - -const Database::DatabaseEntry Database::TransposeHalf = { - "Transpose", Precision::kHalf, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::TransposeSingle = { - "Transpose", Precision::kSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, - { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, - { "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { 
{"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - } - }, - } -}; - 
-// ================================================================================================= - -const Database::DatabaseEntry Database::TransposeComplexSingle = { - "Transpose", Precision::kComplexSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} 
} }, - { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::TransposeDouble = { - "Transpose", Precision::kDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) 
Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry 
Database::TransposeComplexDouble = { - "Transpose", Precision::kComplexDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "GeForce GTX 980", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, 
{"TRA_WPT",1} } }, - { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= -} // namespace clblast diff --git a/include/internal/database/xaxpy.h b/include/internal/database/xaxpy.h deleted file mode 100644 index 6e6719e8..00000000 --- a/include/internal/database/xaxpy.h +++ /dev/null @@ -1,270 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Database generator <database.py> -// -// This file populates the database with best-found tuning parameters for the 'Xaxpy' kernels. 
-// -// ================================================================================================= - -namespace clblast { -// ================================================================================================= - -const Database::DatabaseEntry Database::XaxpyHalf = { - "Xaxpy", Precision::kHalf, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, - { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XaxpySingle = { - "Xaxpy", Precision::kSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } }, - { "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, - { "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, - { "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } }, - { "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, - { "default", { {"VW",1}, {"WGS",64}, 
{"WPT",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",1024}, {"WPT",2} } }, - { "default", { {"VW",2}, {"WGS",1024}, {"WPT",2} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 480", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 680", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, - { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, - { "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, - { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, - { "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XaxpyComplexSingle = { - "Xaxpy", Precision::kComplexSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } }, - { "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, - { "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } }, - { "Intel(R) 
Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, - { "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, - { "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } }, - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, - { "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, - { "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, - { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, - { "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, - { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, - { "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XaxpyDouble = { - "Xaxpy", Precision::kDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, - { "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, - { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - } - }, 
- { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"VW",2}, {"WGS",128}, {"WPT",2} } }, - { "default", { {"VW",2}, {"WGS",128}, {"WPT",2} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, - { "default", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 480", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, - { "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, - { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, - { "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, - { "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XaxpyComplexDouble = { - "Xaxpy", Precision::kComplexDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { 
"Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"VW",1}, {"WGS",64}, {"WPT",8} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",8} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } }, - { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, - { "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, - { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, - { "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= -} // namespace clblast diff --git a/include/internal/database/xdot.h b/include/internal/database/xdot.h deleted file mode 100644 index d09d8c62..00000000 --- a/include/internal/database/xdot.h +++ /dev/null @@ -1,200 +0,0 @@ - -// 
================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Database generator <database.py> -// -// This file populates the database with best-found tuning parameters for the 'Xdot' kernels. -// -// ================================================================================================= - -namespace clblast { -// ================================================================================================= - -const Database::DatabaseEntry Database::XdotHalf = { - "Xdot", Precision::kHalf, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } }, - { "default", { {"WGS1",32}, {"WGS2",32} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",32}, {"WGS2",32} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XdotSingle = { - "Xdot", Precision::kSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, - { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, - { "Tahiti", { {"WGS1",128}, {"WGS2",32} } }, - { "default", { {"WGS1",128}, {"WGS2",32} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",1024}, {"WGS2",32} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } }, - { "Iris Pro", { {"WGS1",512}, {"WGS2",64} } }, - { "default", { {"WGS1",64}, 
{"WGS2",32} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",128}, {"WGS2",32} } }, - { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, - { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } }, - { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } }, - { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, - { "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",128}, {"WGS2",32} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WGS2",32} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XdotComplexSingle = { - "Xdot", Precision::kComplexSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, - { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, - { "Tahiti", { {"WGS1",64}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",1024}, {"WGS2",32} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } }, - { "Iris Pro", { {"WGS1",32}, {"WGS2",32} } }, - { "default", { {"WGS1",32}, {"WGS2",32} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",64}, {"WGS2",32} } }, - { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, - { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, - { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } }, - { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, - { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { 
"default", { {"WGS1",32}, {"WGS2",32} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XdotDouble = { - "Xdot", Precision::kDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, - { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, - { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",64} } }, - { "default", { {"WGS1",512}, {"WGS2",64} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",128}, {"WGS2",32} } }, - { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, - { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, - { "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } }, - { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, - { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, - { "default", { {"WGS1",128}, {"WGS2",32} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WGS2",32} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XdotComplexDouble = { - "Xdot", Precision::kComplexDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, - { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, - { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",1024}, {"WGS2",32} } 
}, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",64}, {"WGS2",32} } }, - { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, - { "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } }, - { "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } }, - { "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } }, - { "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WGS2",32} } }, - } - }, - } -}; - -// ================================================================================================= -} // namespace clblast diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h deleted file mode 100644 index f35d2c88..00000000 --- a/include/internal/database/xgemm.h +++ /dev/null @@ -1,263 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Database generator <database.py> -// -// This file populates the database with best-found tuning parameters for the 'Xgemm' kernels. 
-// -// ================================================================================================= - -namespace clblast { -// ================================================================================================= - -const Database::DatabaseEntry Database::XgemmHalf = { - "Xgemm", Precision::kHalf, { - { // Default - kDeviceTypeAll, "default", { - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XgemmSingle = { - "Xgemm", Precision::kSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } }, - { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, - { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, 
{"VWM",8}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, - { "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, - { "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, - { "default", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - { // Intel accelerators - 
kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, - { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, - { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, - { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } }, - { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } }, - { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, - { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } }, - { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, 
{"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, - { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XgemmComplexSingle = { - "Xgemm", Precision::kComplexSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, - { "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, - { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - 
{ "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, - { "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, 
{"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, - { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, - { "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, - { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, - { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, - { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, 
{"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, - { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, - { "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XgemmDouble = { - "Xgemm", Precision::kDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, - { "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, - { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, 
{"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, - { "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, 
"NVIDIA", { - { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, - { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, - { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, - { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, - { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, - { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - { // Default - 
kDeviceTypeAll, "default", { - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XgemmComplexDouble = { - "Xgemm", Precision::kComplexDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, - { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, 
{"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, - { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, 
{"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, - { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, - { "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - } -}; - -// ================================================================================================= -} // namespace clblast diff --git a/include/internal/database/xgemv.h b/include/internal/database/xgemv.h deleted file mode 100644 index 6b76c8ac..00000000 --- a/include/internal/database/xgemv.h +++ /dev/null @@ -1,231 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Database generator <database.py> -// -// This file populates the database with best-found tuning parameters for the 'Xgemv' kernels. -// -// ================================================================================================= - -namespace clblast { -// ================================================================================================= - -const Database::DatabaseEntry Database::XgemvHalf = { - "Xgemv", Precision::kHalf, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XgemvSingle = { - "Xgemv", Precision::kSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { 
// Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",4} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "Iris", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } }, - { "Iris Pro", { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, - { "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } }, - { "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1}, 
{"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Tesla K20m", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XgemvComplexSingle = { - "Xgemv", Precision::kComplexSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Pitcairn", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) 
Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "Iris", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Iris Pro", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, - { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, 
{"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XgemvDouble = { - "Xgemv", Precision::kDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, 
{"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, - { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } }, - { "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Tesla K20m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XgemvComplexDouble = { - "Xgemv", Precision::kComplexDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, 
{"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - } -}; - -// ================================================================================================= -} // namespace clblast diff --git a/include/internal/database/xger.h b/include/internal/database/xger.h deleted file mode 100644 index f2e0a36f..00000000 --- a/include/internal/database/xger.h +++ /dev/null @@ -1,220 +0,0 @@ - -// 
================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Database generator <database.py> -// -// This file populates the database with best-found tuning parameters for the 'Xger' kernels. -// -// ================================================================================================= - -namespace clblast { -// ================================================================================================= - -const Database::DatabaseEntry Database::XgerHalf = { - "Xger", Precision::kHalf, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, - { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XgerSingle = { - "Xger", Precision::kSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, - { "Hawaii", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, - { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, - { "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, - { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } }, - { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } }, - { 
"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, - { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } }, - { "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } }, - { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",4} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, - { "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, - { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, - { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XgerComplexSingle = { - "Xger", Precision::kComplexSingle, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, - { "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, - { "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, - { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, - { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } }, - { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, - } - }, - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { 
"Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, - { "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } }, - { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, - { "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } }, - { "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, - { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XgerDouble = { - "Xger", Precision::kDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, - { "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, - { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, - { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } }, - { "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } }, - { "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, - { "GeForce GTX TITAN", { 
{"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, - { "default", { {"WGS1",16}, {"WGS2",4}, {"WPT",2} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= - -const Database::DatabaseEntry Database::XgerComplexDouble = { - "Xger", Precision::kComplexDouble, { - { // AMD GPUs - kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, - { "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } }, - { "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, - { "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } }, - { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } }, - } - }, - { // Intel CPUs - kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, - { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, - { "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, - { "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } }, - { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } }, - } - }, - } -}; - -// ================================================================================================= -} // namespace clblast diff --git a/include/internal/public_api.h b/include/internal/public_api.h deleted file mode 100644 index 
d0732297..00000000 --- a/include/internal/public_api.h +++ /dev/null @@ -1,34 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file provides macros to define the public API. This is needed when building a Windows DLL. -// Note: this is only used for the C++ interface, the C interface has its own definition included in -// the header file itself. -// -// ================================================================================================= - -#ifndef CLBLAST_PUBLIC_API_H_ -#define CLBLAST_PUBLIC_API_H_ - -namespace clblast { -// ================================================================================================= - -// Exports library functions under Windows when building a DLL. See also: -// https://msdn.microsoft.com/en-us/library/a90k134d.aspx -// PUBLIC_API expands to __declspec(dllexport) when _WIN32 is defined and to nothing otherwise. -// Note that, as a preprocessor macro, it is not scoped by the surrounding namespace. -#ifdef _WIN32 - #define PUBLIC_API __declspec(dllexport) -#else - #define PUBLIC_API -#endif - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_PUBLIC_API_H_ -#endif diff --git a/include/internal/routine.h b/include/internal/routine.h deleted file mode 100644 index a6a59d77..00000000 --- a/include/internal/routine.h +++ /dev/null @@ -1,68 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements all the basic functionality for the BLAS routines. This class serves as a -// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as -// compiling the OpenCL kernel, connecting to the database, etc. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINE_H_ -#define CLBLAST_ROUTINE_H_ - -#include <string> -#include <vector> - -#include "internal/cache.h" -#include "internal/utilities.h" -#include "internal/database.h" -#include "internal/buffer_test.h" -#include "internal/routines/common.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -class Routine { - public: - - // Base class constructor - explicit Routine(Queue &queue, EventPointer event, const std::string &name, - const std::vector<std::string> &routines, const Precision precision); - - // Set-up phase of the kernel - StatusCode SetUp(); - - protected: - - // Non-static variable for the precision - const Precision precision_; - - // The routine's name and its kernel-source in string form - const std::string routine_name_; - std::string source_string_; - - // The OpenCL objects, accessible only from derived classes - Queue queue_; - EventPointer event_; - const Context context_; - const Device device_; - - // OpenCL device properties - const std::string device_name_; - - // Connection to the database for all the device-specific parameters - const Database db_; -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINE_H_ -#endif diff --git a/include/internal/routines/common.h b/include/internal/routines/common.h deleted file mode 100644 index 308785bd..00000000 --- 
a/include/internal/routines/common.h +++ /dev/null @@ -1,173 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file contains all the interfaces to common kernels, such as copying, padding, and -// transposing a matrix. These functions are templated and thus header-only. This file also contains -// other common functions to routines, such as a function to launch a kernel. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_COMMON_H_ -#define CLBLAST_ROUTINES_COMMON_H_ - -#include <string> -#include <vector> - -#include "clblast.h" -#include "internal/clpp11.h" -#include "internal/database.h" - -namespace clblast { -// ================================================================================================= - -// Enqueues a kernel, waits for completion, and checks for errors -StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, - std::vector<size_t> global, const std::vector<size_t> &local, - EventPointer event, std::vector<Event>& waitForEvents); - -// As above, but without an event waiting list -StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, - std::vector<size_t> global, const std::vector<size_t> &local, - EventPointer event); - -// ================================================================================================= - -// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able -// to write to symmetric and triangular matrices through optional arguments. 
-template <typename T> -StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Context &context, - const Database &db, - EventPointer event, std::vector<Event>& waitForEvents, - const size_t src_one, const size_t src_two, - const size_t src_ld, const size_t src_offset, - const Buffer<T> &src, - const size_t dest_one, const size_t dest_two, - const size_t dest_ld, const size_t dest_offset, - const Buffer<T> &dest, - const T alpha, - const Program &program, const bool do_pad, - const bool do_transpose, const bool do_conjugate, - const bool upper = false, const bool lower = false, - const bool diagonal_imag_zero = false) { - - // Determines whether or not the fast-version could potentially be used - auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && - (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) && - (upper == false) && (lower == false) && (diagonal_imag_zero == false); - - // Determines the right kernel - auto kernel_name = std::string{}; - if (do_transpose) { - if (use_fast_kernel && - IsMultiple(src_ld, db["TRA_WPT"]) && - IsMultiple(src_one, db["TRA_WPT"]*db["TRA_WPT"]) && - IsMultiple(src_two, db["TRA_WPT"]*db["TRA_WPT"])) { - kernel_name = "TransposeMatrixFast"; - } - else { - use_fast_kernel = false; - kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix"; - } - } - else { - if (use_fast_kernel && - IsMultiple(src_ld, db["COPY_VW"]) && - IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) && - IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) { - kernel_name = "CopyMatrixFast"; - } - else { - use_fast_kernel = false; - kernel_name = (do_pad) ? 
"CopyPadMatrix" : "CopyMatrix"; - } - } - - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer<T>(context, 1); - alpha_buffer.Write(queue, 1, &alpha); - - // Retrieves the kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(src_ld)); - kernel.SetArgument(1, src()); - kernel.SetArgument(2, dest()); - kernel.SetArgument(3, alpha_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(src_one)); - kernel.SetArgument(1, static_cast<int>(src_two)); - kernel.SetArgument(2, static_cast<int>(src_ld)); - kernel.SetArgument(3, static_cast<int>(src_offset)); - kernel.SetArgument(4, src()); - kernel.SetArgument(5, static_cast<int>(dest_one)); - kernel.SetArgument(6, static_cast<int>(dest_two)); - kernel.SetArgument(7, static_cast<int>(dest_ld)); - kernel.SetArgument(8, static_cast<int>(dest_offset)); - kernel.SetArgument(9, dest()); - kernel.SetArgument(10, alpha_buffer()); - if (do_pad) { - kernel.SetArgument(11, static_cast<int>(do_conjugate)); - } - else { - kernel.SetArgument(11, static_cast<int>(upper)); - kernel.SetArgument(12, static_cast<int>(lower)); - kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero)); - } - } - - // Launches the kernel and returns the error code. Uses global and local thread sizes based on - // parameters in the database. 
- if (do_transpose) { - if (use_fast_kernel) { - const auto global = std::vector<size_t>{ - dest_one / db["TRA_WPT"], - dest_two / db["TRA_WPT"] - }; - const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]}; - return RunKernel(kernel, queue, device, global, local, event, waitForEvents); - } - else { - const auto global = std::vector<size_t>{ - Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), - Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]) - }; - const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]}; - return RunKernel(kernel, queue, device, global, local, event, waitForEvents); - } - } - else { - if (use_fast_kernel) { - const auto global = std::vector<size_t>{ - dest_one / db["COPY_VW"], - dest_two / db["COPY_WPT"] - }; - const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]}; - return RunKernel(kernel, queue, device, global, local, event, waitForEvents); - } - else { - const auto global = std::vector<size_t>{ - Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), - Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]) - }; - const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]}; - return RunKernel(kernel, queue, device, global, local, event, waitForEvents); - } - } - } catch (...) { return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_COMMON_H_ -#endif diff --git a/include/internal/routines/level1/xamax.h b/include/internal/routines/level1/xamax.h deleted file mode 100644 index 42f8f67c..00000000 --- a/include/internal/routines/level1/xamax.h +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xamax routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XAMAX_H_ -#define CLBLAST_ROUTINES_XAMAX_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xamax: public Routine { - public: - - // Constructor - Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX"); - - // Templated-precision implementation of the routine - StatusCode DoAmax(const size_t n, - const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XAMAX_H_ -#endif diff --git a/include/internal/routines/level1/xasum.h b/include/internal/routines/level1/xasum.h deleted file mode 100644 index 9d93a6f4..00000000 --- a/include/internal/routines/level1/xasum.h +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xasum routine. 
The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XASUM_H_ -#define CLBLAST_ROUTINES_XASUM_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xasum: public Routine { - public: - - // Constructor - Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM"); - - // Templated-precision implementation of the routine - StatusCode DoAsum(const size_t n, - const Buffer<T> &asum_buffer, const size_t asum_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XASUM_H_ -#endif diff --git a/include/internal/routines/level1/xaxpy.h b/include/internal/routines/level1/xaxpy.h deleted file mode 100644 index 4c8d2c1f..00000000 --- a/include/internal/routines/level1/xaxpy.h +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xaxpy routine. The precision is implemented using a template argument. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XAXPY_H_ -#define CLBLAST_ROUTINES_XAXPY_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xaxpy: public Routine { - public: - - // Constructor - Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY"); - - // Templated-precision implementation of the routine - StatusCode DoAxpy(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XAXPY_H_ -#endif diff --git a/include/internal/routines/level1/xcopy.h b/include/internal/routines/level1/xcopy.h deleted file mode 100644 index c7d03dd0..00000000 --- a/include/internal/routines/level1/xcopy.h +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xcopy routine. The precision is implemented using a template argument. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XCOPY_H_ -#define CLBLAST_ROUTINES_XCOPY_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xcopy: public Routine { - public: - - // Constructor - Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY"); - - // Templated-precision implementation of the routine - StatusCode DoCopy(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XCOPY_H_ -#endif diff --git a/include/internal/routines/level1/xdot.h b/include/internal/routines/level1/xdot.h deleted file mode 100644 index e1968740..00000000 --- a/include/internal/routines/level1/xdot.h +++ /dev/null @@ -1,42 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xdot routine. The precision is implemented using a template argument. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XDOT_H_ -#define CLBLAST_ROUTINES_XDOT_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xdot: public Routine { - public: - - // Constructor - Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT"); - - // Templated-precision implementation of the routine - StatusCode DoDot(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const bool do_conjugate = false); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XDOT_H_ -#endif diff --git a/include/internal/routines/level1/xdotc.h b/include/internal/routines/level1/xdotc.h deleted file mode 100644 index 0dc2cfe9..00000000 --- a/include/internal/routines/level1/xdotc.h +++ /dev/null @@ -1,44 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xdotc routine. The precision is implemented using a template argument. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XDOTC_H_ -#define CLBLAST_ROUTINES_XDOTC_H_ - -#include "internal/routines/level1/xdot.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xdotc: public Xdot<T> { - public: - - // Uses the regular Xdot routine - using Xdot<T>::DoDot; - - // Constructor - Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC"); - - // Templated-precision implementation of the routine - StatusCode DoDotc(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XDOTC_H_ -#endif diff --git a/include/internal/routines/level1/xdotu.h b/include/internal/routines/level1/xdotu.h deleted file mode 100644 index 98988744..00000000 --- a/include/internal/routines/level1/xdotu.h +++ /dev/null @@ -1,44 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xdotu routine. The precision is implemented using a template argument. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XDOTU_H_ -#define CLBLAST_ROUTINES_XDOTU_H_ - -#include "internal/routines/level1/xdot.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xdotu: public Xdot<T> { - public: - - // Uses the regular Xdot routine - using Xdot<T>::DoDot; - - // Constructor - Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU"); - - // Templated-precision implementation of the routine - StatusCode DoDotu(const size_t n, - const Buffer<T> &dot_buffer, const size_t dot_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XDOTU_H_ -#endif diff --git a/include/internal/routines/level1/xmax.h b/include/internal/routines/level1/xmax.h deleted file mode 100644 index a872cede..00000000 --- a/include/internal/routines/level1/xmax.h +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xmax routine. The precision is implemented using a template argument. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XMAX_H_ -#define CLBLAST_ROUTINES_XMAX_H_ - -#include "internal/routine.h" -#include "internal/routines/level1/xamax.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xmax: public Xamax<T> { - public: - - // Members and methods from the base class - using Xamax<T>::DoAmax; - - // Constructor - Xmax(Queue &queue, EventPointer event, const std::string &name = "MAX"): - Xamax<T>(queue, event, name) { - } - - // Forwards to the regular absolute version. The implementation difference is realised in the - // kernel through a pre-processor macro based on the name of the routine. - StatusCode DoMax(const size_t n, - const Buffer<unsigned int> &imax_buffer, const size_t imax_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { - return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XMAX_H_ -#endif diff --git a/include/internal/routines/level1/xmin.h b/include/internal/routines/level1/xmin.h deleted file mode 100644 index 700c81cc..00000000 --- a/include/internal/routines/level1/xmin.h +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xmin routine. 
The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XMIN_H_ -#define CLBLAST_ROUTINES_XMIN_H_ - -#include "internal/routine.h" -#include "internal/routines/level1/xamax.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xmin: public Xamax<T> { - public: - - // Members and methods from the base class - using Xamax<T>::DoAmax; - - // Constructor - Xmin(Queue &queue, EventPointer event, const std::string &name = "MIN"): - Xamax<T>(queue, event, name) { - } - - // Forwards to the regular max-absolute version. The implementation difference is realised in the - // kernel through a pre-processor macro based on the name of the routine. - StatusCode DoMin(const size_t n, - const Buffer<unsigned int> &imin_buffer, const size_t imin_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { - return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XMIN_H_ -#endif diff --git a/include/internal/routines/level1/xnrm2.h b/include/internal/routines/level1/xnrm2.h deleted file mode 100644 index ca9268c0..00000000 --- a/include/internal/routines/level1/xnrm2.h +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xnrm2 routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XNRM2_H_ -#define CLBLAST_ROUTINES_XNRM2_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xnrm2: public Routine { - public: - - // Constructor - Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2"); - - // Templated-precision implementation of the routine - StatusCode DoNrm2(const size_t n, - const Buffer<T> &nrm2_buffer, const size_t nrm2_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XNRM2_H_ -#endif diff --git a/include/internal/routines/level1/xscal.h b/include/internal/routines/level1/xscal.h deleted file mode 100644 index b9430f3b..00000000 --- a/include/internal/routines/level1/xscal.h +++ /dev/null @@ -1,39 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xscal routine. The precision is implemented using a template argument. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSCAL_H_ -#define CLBLAST_ROUTINES_XSCAL_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xscal: public Routine { - public: - - // Constructor - Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL"); - - // Templated-precision implementation of the routine - StatusCode DoScal(const size_t n, const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSCAL_H_ -#endif diff --git a/include/internal/routines/level1/xsum.h b/include/internal/routines/level1/xsum.h deleted file mode 100644 index 2f633b52..00000000 --- a/include/internal/routines/level1/xsum.h +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xsum routine. The precision is implemented using a template argument. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSUM_H_ -#define CLBLAST_ROUTINES_XSUM_H_ - -#include "internal/routine.h" -#include "internal/routines/level1/xasum.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xsum: public Xasum<T> { - public: - - // Members and methods from the base class - using Xasum<T>::DoAsum; - - // Constructor - Xsum(Queue &queue, EventPointer event, const std::string &name = "SUM"): - Xasum<T>(queue, event, name) { - } - - // Forwards to the regular absolute version. The implementation difference is realised in the - // kernel through a pre-processor macro based on the name of the routine. - StatusCode DoSum(const size_t n, - const Buffer<T> &sum_buffer, const size_t sum_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { - return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc); - } -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSUM_H_ -#endif diff --git a/include/internal/routines/level1/xswap.h b/include/internal/routines/level1/xswap.h deleted file mode 100644 index bd063afc..00000000 --- a/include/internal/routines/level1/xswap.h +++ /dev/null @@ -1,40 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xswap routine. 
The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSWAP_H_ -#define CLBLAST_ROUTINES_XSWAP_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xswap: public Routine { - public: - - // Constructor - Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP"); - - // Templated-precision implementation of the routine - StatusCode DoSwap(const size_t n, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSWAP_H_ -#endif diff --git a/include/internal/routines/level2/xgbmv.h b/include/internal/routines/level2/xgbmv.h deleted file mode 100644 index bc94c77d..00000000 --- a/include/internal/routines/level2/xgbmv.h +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xgbmv routine. It is based on the generalized mat-vec multiplication -// routine (Xgemv). The Xgbmv class inherits from the templated class Xgemv, allowing it to call the -// "MatVec" function directly. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XGBMV_H_ -#define CLBLAST_ROUTINES_XGBMV_H_ - -#include "internal/routines/level2/xgemv.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xgbmv: public Xgemv<T> { - public: - - // Uses the generic matrix-vector routine - using Xgemv<T>::MatVec; - - // Constructor - Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV"); - - // Templated-precision implementation of the routine - StatusCode DoGbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XGBMV_H_ -#endif diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h deleted file mode 100644 index e9804c62..00000000 --- a/include/internal/routines/level2/xgemv.h +++ /dev/null @@ -1,56 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xgemv routine. The precision is implemented using a template argument. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XGEMV_H_ -#define CLBLAST_ROUTINES_XGEMV_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xgemv: public Routine { - public: - - // Constructor - Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV"); - - // Templated-precision implementation of the routine - StatusCode DoGemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); - - // Generic version used also for other matrix-vector multiplications - StatusCode MatVec(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - bool fast_kernel, bool fast_kernel_rot, - const size_t parameter, const bool packed, - const size_t kl, const size_t ku); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XGEMV_H_ -#endif diff --git a/include/internal/routines/level2/xger.h b/include/internal/routines/level2/xger.h deleted file mode 100644 index 184f8477..00000000 --- a/include/internal/routines/level2/xger.h +++ /dev/null @@ -1,43 +0,0 @@ - -// 
================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xger routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XGER_H_ -#define CLBLAST_ROUTINES_XGER_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xger: public Routine { - public: - - // Constructor - Xger(Queue &queue, EventPointer event, const std::string &name = "GER"); - - // Templated-precision implementation of the routine - StatusCode DoGer(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XGER_H_ -#endif diff --git a/include/internal/routines/level2/xgerc.h b/include/internal/routines/level2/xgerc.h deleted file mode 100644 index 6d06ef94..00000000 --- a/include/internal/routines/level2/xgerc.h +++ /dev/null @@ -1,46 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xgerc routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XGERC_H_ -#define CLBLAST_ROUTINES_XGERC_H_ - -#include "internal/routines/level2/xger.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xgerc: public Xger<T> { - public: - - // Uses the regular Xger routine - using Xger<T>::DoGer; - - // Constructor - Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC"); - - // Templated-precision implementation of the routine - StatusCode DoGerc(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XGERC_H_ -#endif diff --git a/include/internal/routines/level2/xgeru.h b/include/internal/routines/level2/xgeru.h deleted file mode 100644 index 45ce1cba..00000000 --- a/include/internal/routines/level2/xgeru.h +++ /dev/null @@ -1,46 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xgeru routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XGERU_H_ -#define CLBLAST_ROUTINES_XGERU_H_ - -#include "internal/routines/level2/xger.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xgeru: public Xger<T> { - public: - - // Uses the regular Xger routine - using Xger<T>::DoGer; - - // Constructor - Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU"); - - // Templated-precision implementation of the routine - StatusCode DoGeru(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XGERU_H_ -#endif diff --git a/include/internal/routines/level2/xhbmv.h b/include/internal/routines/level2/xhbmv.h deleted file mode 100644 index f0a6212c..00000000 --- a/include/internal/routines/level2/xhbmv.h +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xhbmv routine. It is based on the generalized mat-vec multiplication -// routine (Xgemv). The Xhbmv class inherits from the templated class Xgemv, allowing it to call the -// "MatVec" function directly. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XHBMV_H_ -#define CLBLAST_ROUTINES_XHBMV_H_ - -#include "internal/routines/level2/xgemv.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xhbmv: public Xgemv<T> { - public: - - // Uses the generic matrix-vector routine - using Xgemv<T>::MatVec; - - // Constructor - Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV"); - - // Templated-precision implementation of the routine - StatusCode DoHbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XHBMV_H_ -#endif diff --git a/include/internal/routines/level2/xhemv.h b/include/internal/routines/level2/xhemv.h deleted file mode 100644 index 3daf2457..00000000 --- a/include/internal/routines/level2/xhemv.h +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= -// 
This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xhemv routine. It is based on the generalized mat-vec multiplication -// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call the -// "MatVec" function directly. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XHEMV_H_ -#define CLBLAST_ROUTINES_XHEMV_H_ - -#include "internal/routines/level2/xgemv.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xhemv: public Xgemv<T> { - public: - - // Uses the generic matrix-vector routine - using Xgemv<T>::MatVec; - - // Constructor - Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV"); - - // Templated-precision implementation of the routine - StatusCode DoHemv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XHEMV_H_ -#endif diff --git a/include/internal/routines/level2/xher.h b/include/internal/routines/level2/xher.h deleted file mode 100644 index fca8bb97..00000000 --- a/include/internal/routines/level2/xher.h +++ /dev/null @@ -1,46 +0,0 @@ - -// 
================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xher routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XHER_H_ -#define CLBLAST_ROUTINES_XHER_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T, typename U> -class Xher: public Routine { - public: - - // Constructor - Xher(Queue &queue, EventPointer event, const std::string &name = "HER"); - - // Translates alpha of type 'U' into type 'T' - T GetAlpha(const U alpha); - - // Templated-precision implementation of the routine - StatusCode DoHer(const Layout layout, const Triangle triangle, - const size_t n, - const U alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const bool packed = false); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XHER_H_ -#endif diff --git a/include/internal/routines/level2/xher2.h b/include/internal/routines/level2/xher2.h deleted file mode 100644 index 9a7610f1..00000000 --- a/include/internal/routines/level2/xher2.h +++ /dev/null @@ -1,44 +0,0 @@ - -// ================================================================================================= -// This file is part 
of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xher2 routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XHER2_H_ -#define CLBLAST_ROUTINES_XHER2_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xher2: public Routine { - public: - - // Constructor - Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2"); - - // Templated-precision implementation of the routine - StatusCode DoHer2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const bool packed = false); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XHER2_H_ -#endif diff --git a/include/internal/routines/level2/xhpmv.h b/include/internal/routines/level2/xhpmv.h deleted file mode 100644 index a1d5595a..00000000 --- a/include/internal/routines/level2/xhpmv.h +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xhpmv routine. It is based on the generalized mat-vec multiplication -// routine (Xgemv). The Xhpmv class inherits from the templated class Xgemv, allowing it to call the -// "MatVec" function directly. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XHPMV_H_ -#define CLBLAST_ROUTINES_XHPMV_H_ - -#include "internal/routines/level2/xgemv.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xhpmv: public Xgemv<T> { - public: - - // Uses the generic matrix-vector routine - using Xgemv<T>::MatVec; - - // Constructor - Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV"); - - // Templated-precision implementation of the routine - StatusCode DoHpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XHPMV_H_ -#endif diff --git a/include/internal/routines/level2/xhpr.h b/include/internal/routines/level2/xhpr.h deleted file mode 100644 index 6554d74c..00000000 --- a/include/internal/routines/level2/xhpr.h +++ /dev/null @@ -1,45 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast 
project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xhpr routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XHPR_H_ -#define CLBLAST_ROUTINES_XHPR_H_ - -#include "internal/routines/level2/xher.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T, typename U> -class Xhpr: public Xher<T,U> { - public: - - // Uses the regular Xher routine - using Xher<T,U>::DoHer; - - // Constructor - Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR"); - - // Templated-precision implementation of the routine - StatusCode DoHpr(const Layout layout, const Triangle triangle, - const size_t n, - const U alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XHPR_H_ -#endif diff --git a/include/internal/routines/level2/xhpr2.h b/include/internal/routines/level2/xhpr2.h deleted file mode 100644 index d95e7b61..00000000 --- a/include/internal/routines/level2/xhpr2.h +++ /dev/null @@ -1,46 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xhpr2 routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XHPR2_H_ -#define CLBLAST_ROUTINES_XHPR2_H_ - -#include "internal/routines/level2/xher2.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xhpr2: public Xher2<T> { - public: - - // Uses the regular Xher2 routine - using Xher2<T>::DoHer2; - - // Constructor - Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2"); - - // Templated-precision implementation of the routine - StatusCode DoHpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XHPR2_H_ -#endif diff --git a/include/internal/routines/level2/xsbmv.h b/include/internal/routines/level2/xsbmv.h deleted file mode 100644 index 4328e377..00000000 --- a/include/internal/routines/level2/xsbmv.h +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xsbmv routine. It is based on the generalized mat-vec multiplication -// routine (Xgemv). The Xsbmv class inherits from the templated class Xgemv, allowing it to call the -// "MatVec" function directly. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSBMV_H_ -#define CLBLAST_ROUTINES_XSBMV_H_ - -#include "internal/routines/level2/xgemv.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xsbmv: public Xgemv<T> { - public: - - // Uses the generic matrix-vector routine - using Xgemv<T>::MatVec; - - // Constructor - Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV"); - - // Templated-precision implementation of the routine - StatusCode DoSbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSBMV_H_ -#endif diff --git a/include/internal/routines/level2/xspmv.h b/include/internal/routines/level2/xspmv.h deleted file mode 100644 index ca3e28b6..00000000 --- a/include/internal/routines/level2/xspmv.h +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= -// 
This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xspmv routine. It is based on the generalized mat-vec multiplication -// routine (Xgemv). The Xspmv class inherits from the templated class Xgemv, allowing it to call the -// "MatVec" function directly. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSPMV_H_ -#define CLBLAST_ROUTINES_XSPMV_H_ - -#include "internal/routines/level2/xgemv.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xspmv: public Xgemv<T> { - public: - - // Uses the generic matrix-vector routine - using Xgemv<T>::MatVec; - - // Constructor - Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV"); - - // Templated-precision implementation of the routine - StatusCode DoSpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSPMV_H_ -#endif diff --git a/include/internal/routines/level2/xspr.h b/include/internal/routines/level2/xspr.h deleted file mode 100644 index 7e91abc5..00000000 --- a/include/internal/routines/level2/xspr.h +++ /dev/null @@ -1,45 +0,0 @@ - -// 
================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xspr routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSPR_H_ -#define CLBLAST_ROUTINES_XSPR_H_ - -#include "internal/routines/level2/xher.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xspr: public Xher<T,T> { - public: - - // Uses the regular Xher routine - using Xher<T,T>::DoHer; - - // Constructor - Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR"); - - // Templated-precision implementation of the routine - StatusCode DoSpr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSPR_H_ -#endif diff --git a/include/internal/routines/level2/xspr2.h b/include/internal/routines/level2/xspr2.h deleted file mode 100644 index a34be8e8..00000000 --- a/include/internal/routines/level2/xspr2.h +++ /dev/null @@ -1,46 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xspr2 routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSPR2_H_ -#define CLBLAST_ROUTINES_XSPR2_H_ - -#include "internal/routines/level2/xher2.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xspr2: public Xher2<T> { - public: - - // Uses the regular Xher2 routine - using Xher2<T>::DoHer2; - - // Constructor - Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2"); - - // Templated-precision implementation of the routine - StatusCode DoSpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &ap_buffer, const size_t ap_offset); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSPR2_H_ -#endif diff --git a/include/internal/routines/level2/xsymv.h b/include/internal/routines/level2/xsymv.h deleted file mode 100644 index 98a0ce88..00000000 --- a/include/internal/routines/level2/xsymv.h +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xsymv routine. It is based on the generalized mat-vec multiplication -// routine (Xgemv). The Xsymv class inherits from the templated class Xgemv, allowing it to call the -// "MatVec" function directly. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSYMV_H_ -#define CLBLAST_ROUTINES_XSYMV_H_ - -#include "internal/routines/level2/xgemv.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xsymv: public Xgemv<T> { - public: - - // Uses the generic matrix-vector routine - using Xgemv<T>::MatVec; - - // Constructor - Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV"); - - // Templated-precision implementation of the routine - StatusCode DoSymv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSYMV_H_ -#endif diff --git a/include/internal/routines/level2/xsyr.h b/include/internal/routines/level2/xsyr.h deleted file mode 100644 index f88498ae..00000000 --- a/include/internal/routines/level2/xsyr.h +++ /dev/null @@ -1,45 +0,0 @@ - -// ================================================================================================= -// This file is part 
of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xsyr routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSYR_H_ -#define CLBLAST_ROUTINES_XSYR_H_ - -#include "internal/routines/level2/xher.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xsyr: public Xher<T,T> { - public: - - // Uses the regular Xher routine - using Xher<T,T>::DoHer; - - // Constructor - Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR"); - - // Templated-precision implementation of the routine - StatusCode DoSyr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSYR_H_ -#endif diff --git a/include/internal/routines/level2/xsyr2.h b/include/internal/routines/level2/xsyr2.h deleted file mode 100644 index d2d3143a..00000000 --- a/include/internal/routines/level2/xsyr2.h +++ /dev/null @@ -1,46 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xsyr2 routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSYR2_H_ -#define CLBLAST_ROUTINES_XSYR2_H_ - -#include "internal/routines/level2/xher2.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xsyr2: public Xher2<T> { - public: - - // Uses the regular Xher2 routine - using Xher2<T>::DoHer2; - - // Constructor - Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2"); - - // Templated-precision implementation of the routine - StatusCode DoSyr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSYR2_H_ -#endif diff --git a/include/internal/routines/level2/xtbmv.h b/include/internal/routines/level2/xtbmv.h deleted file mode 100644 index 493a9853..00000000 --- a/include/internal/routines/level2/xtbmv.h +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xtbmv routine. It is based on the generalized mat-vec multiplication -// routine (Xgemv). The Xtbmv class inherits from the templated class Xgemv, allowing it to call the -// "MatVec" function directly. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XTBMV_H_ -#define CLBLAST_ROUTINES_XTBMV_H_ - -#include "internal/routines/level2/xgemv.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xtbmv: public Xgemv<T> { - public: - - // Uses the generic matrix-vector routine - using Xgemv<T>::queue_; - using Xgemv<T>::context_; - using Xgemv<T>::MatVec; - - // Constructor - Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV"); - - // Templated-precision implementation of the routine - StatusCode DoTbmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XTBMV_H_ -#endif diff --git a/include/internal/routines/level2/xtpmv.h b/include/internal/routines/level2/xtpmv.h deleted file mode 100644 index ce5cae6f..00000000 --- a/include/internal/routines/level2/xtpmv.h +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= 
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xtpmv routine. It is based on the generalized mat-vec multiplication -// routine (Xgemv). The Xtpmv class inherits from the templated class Xgemv, allowing it to call the -// "MatVec" function directly. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XTPMV_H_ -#define CLBLAST_ROUTINES_XTPMV_H_ - -#include "internal/routines/level2/xgemv.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xtpmv: public Xgemv<T> { - public: - - // Uses the generic matrix-vector routine - using Xgemv<T>::queue_; - using Xgemv<T>::context_; - using Xgemv<T>::MatVec; - - // Constructor - Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV"); - - // Templated-precision implementation of the routine - StatusCode DoTpmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const Buffer<T> &ap_buffer, const size_t ap_offset, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XTPMV_H_ -#endif diff --git a/include/internal/routines/level2/xtrmv.h b/include/internal/routines/level2/xtrmv.h deleted file mode 100644 index 4407bad7..00000000 --- a/include/internal/routines/level2/xtrmv.h +++ /dev/null @@ -1,49 +0,0 @@ - -// 
================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xtrmv routine. It is based on the generalized mat-vec multiplication -// routine (Xgemv). The Xtrmv class inherits from the templated class Xgemv, allowing it to call the -// "MatVec" function directly. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XTRMV_H_ -#define CLBLAST_ROUTINES_XTRMV_H_ - -#include "internal/routines/level2/xgemv.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xtrmv: public Xgemv<T> { - public: - - // Uses the generic matrix-vector routine - using Xgemv<T>::queue_; - using Xgemv<T>::context_; - using Xgemv<T>::MatVec; - - // Constructor - Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV"); - - // Templated-precision implementation of the routine - StatusCode DoTrmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XTRMV_H_ -#endif diff --git a/include/internal/routines/level3/xgemm.h b/include/internal/routines/level3/xgemm.h deleted file mode 100644 index c0541eef..00000000 --- 
a/include/internal/routines/level3/xgemm.h +++ /dev/null @@ -1,48 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xgemm routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XGEMM_H_ -#define CLBLAST_ROUTINES_XGEMM_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xgemm: public Routine { - public: - - // Constructor - Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM"); - - // Templated-precision implementation of the routine - StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); - - protected: - // Static variable to get the precision - const static Precision precision_; -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XGEMM_H_ -#endif diff --git a/include/internal/routines/level3/xhemm.h b/include/internal/routines/level3/xhemm.h deleted file mode 100644 index 
e0f35669..00000000 --- a/include/internal/routines/level3/xhemm.h +++ /dev/null @@ -1,54 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xhemm routine. It is based on the generalized matrix multiplication -// routine (Xgemm). The implementation is very similar to the Xsymm routine. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XHEMM_H_ -#define CLBLAST_ROUTINES_XHEMM_H_ - -#include "internal/routines/level3/xgemm.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xhemm: public Xgemm<T> { - public: - - // Uses methods and variables the regular Xgemm routine - using Xgemm<T>::precision_; - using Xgemm<T>::routine_name_; - using Xgemm<T>::queue_; - using Xgemm<T>::context_; - using Xgemm<T>::device_; - using Xgemm<T>::db_; - using Xgemm<T>::DoGemm; - - // Constructor - Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM"); - - // Templated-precision implementation of the routine - StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); -}; - -// 
================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XHEMM_H_ -#endif diff --git a/include/internal/routines/level3/xher2k.h b/include/internal/routines/level3/xher2k.h deleted file mode 100644 index b7764e18..00000000 --- a/include/internal/routines/level3/xher2k.h +++ /dev/null @@ -1,46 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xher2k routine. The precision is implemented using the template argument -// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the -// Xsyr2k routine. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XHER2K_H_ -#define CLBLAST_ROUTINES_XHER2K_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T, typename U> -class Xher2k: public Routine { - public: - - // Constructor - Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K"); - - // Templated-precision implementation of the routine - StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const U beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XHER2K_H_ -#endif diff --git a/include/internal/routines/level3/xherk.h b/include/internal/routines/level3/xherk.h deleted file mode 100644 index abcf4c1a..00000000 --- a/include/internal/routines/level3/xherk.h +++ /dev/null @@ -1,45 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xherk routine. The precision is implemented using the template argument -// 'T', whereas the alpha/beta arguments are of type 'U'. 
The implementation is very similar to the -// Xsyrk routine. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XHERK_H_ -#define CLBLAST_ROUTINES_XHERK_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T, typename U> -class Xherk: public Routine { - public: - - // Constructor - Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK"); - - // Templated-precision implementation of the routine - StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const U alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const U beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XHERK_H_ -#endif diff --git a/include/internal/routines/level3/xsymm.h b/include/internal/routines/level3/xsymm.h deleted file mode 100644 index 889abfb7..00000000 --- a/include/internal/routines/level3/xsymm.h +++ /dev/null @@ -1,56 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xsymm routine. It is based on the generalized matrix multiplication -// routine (Xgemm). 
The Xsymm class inherits from the templated class Xgemm, allowing it to call the -// "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by -// transforming it into a general matrix, and then calls the regular GEMM code. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSYMM_H_ -#define CLBLAST_ROUTINES_XSYMM_H_ - -#include "internal/routines/level3/xgemm.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xsymm: public Xgemm<T> { - public: - - // Uses methods and variables the regular Xgemm routine - using Xgemm<T>::precision_; - using Xgemm<T>::routine_name_; - using Xgemm<T>::queue_; - using Xgemm<T>::context_; - using Xgemm<T>::device_; - using Xgemm<T>::db_; - using Xgemm<T>::DoGemm; - - // Constructor - Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM"); - - // Templated-precision implementation of the routine - StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSYMM_H_ -#endif diff --git a/include/internal/routines/level3/xsyr2k.h b/include/internal/routines/level3/xsyr2k.h deleted file mode 100644 index f75c91e5..00000000 --- a/include/internal/routines/level3/xsyr2k.h +++ /dev/null @@ -1,46 +0,0 @@ - -// 
================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xsyr2k routine. The precision is implemented using a template argument. -// The implementation is very similar to Xsyrk (see header for details), except for the fact that -// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSYR2K_H_ -#define CLBLAST_ROUTINES_XSYR2K_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xsyr2k: public Routine { - public: - - // Constructor - Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K"); - - // Templated-precision implementation of the routine - StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSYR2K_H_ -#endif diff --git a/include/internal/routines/level3/xsyrk.h b/include/internal/routines/level3/xsyrk.h deleted file mode 100644 index 
0710fa74..00000000 --- a/include/internal/routines/level3/xsyrk.h +++ /dev/null @@ -1,47 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xsyrk routine. The precision is implemented using a template argument. -// The implementation is based on the regular Xgemm routine and kernel, but with two main changes: -// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part. -// 2) The main Xgemm kernel masks workgroups not contributing to usefull data. This is only for -// performance reasons, as the actual masking is done later (see the first point). -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XSYRK_H_ -#define CLBLAST_ROUTINES_XSYRK_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xsyrk: public Routine { - public: - - // Constructor - Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK"); - - // Templated-precision implementation of the routine - StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); -}; - -// 
================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XSYRK_H_ -#endif diff --git a/include/internal/routines/level3/xtrmm.h b/include/internal/routines/level3/xtrmm.h deleted file mode 100644 index e18ad17a..00000000 --- a/include/internal/routines/level3/xtrmm.h +++ /dev/null @@ -1,54 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xtrmm routine. The implementation is based on first transforming the -// upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM -// routine. Therefore, this class inherits from the Xgemm class. 
-// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XTRMM_H_ -#define CLBLAST_ROUTINES_XTRMM_H_ - -#include "internal/routines/level3/xgemm.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xtrmm: public Xgemm<T> { - public: - - // Uses methods and variables the regular Xgemm routine - using Xgemm<T>::precision_; - using Xgemm<T>::routine_name_; - using Xgemm<T>::queue_; - using Xgemm<T>::context_; - using Xgemm<T>::device_; - using Xgemm<T>::db_; - using Xgemm<T>::DoGemm; - - // Constructor - Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM"); - - // Templated-precision implementation of the routine - StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XTRMM_H_ -#endif diff --git a/include/internal/routines/levelx/xomatcopy.h b/include/internal/routines/levelx/xomatcopy.h deleted file mode 100644 index d2acb50d..00000000 --- a/include/internal/routines/levelx/xomatcopy.h +++ /dev/null @@ -1,41 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the Xomatcopy routine. The precision is implemented using a template argument. -// -// ================================================================================================= - -#ifndef CLBLAST_ROUTINES_XOMATCOPY_H_ -#define CLBLAST_ROUTINES_XOMATCOPY_H_ - -#include "internal/routine.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template <typename T> -class Xomatcopy: public Routine { - public: - - // Constructor - Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY"); - - // Templated-precision implementation of the routine - StatusCode DoOmatcopy(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld); -}; - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_ROUTINES_XOMATCOPY_H_ -#endif diff --git a/include/internal/tuning.h b/include/internal/tuning.h deleted file mode 100644 index a44f79d6..00000000 --- a/include/internal/tuning.h +++ /dev/null @@ -1,161 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file implements the interface to the CLTune auto-tuner. 
This is only used for the optional -// and stand-alone tuner binaries and not part of the core of CLBlast. -// -// ================================================================================================= - -#ifndef CLBLAST_TUNING_H_ -#define CLBLAST_TUNING_H_ - -#include <vector> -#include <string> - -#include <cltune.h> - -#include "internal/utilities.h" - -namespace clblast { -// ================================================================================================= - -// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect -// the results. Used for all types of kernel families. Note that this is a header-only function so -// that it is automatically compiled for the various kernels (given as the 'C' template argument). -template <typename C, typename T> -void Tuner(int argc, char* argv[]) { - - // Sets the parameters and platform/device for which to tune (command-line options) - auto help = std::string{"* Options given/available:\n"}; - auto args = Arguments<T>{}; - args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); - args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); - args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); - for (auto &o: C::GetOptions()) { - if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, C::DefaultM()); } - if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, C::DefaultN()); } - if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, C::DefaultK()); } - if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); } - if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); } - if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); } - } - fprintf(stdout, "%s\n", help.c_str()); - - // Tests validity of the given arguments - C::TestValidArguments(args); - - // 
Tests for validity of the precision and retrieves properties - auto isAMD = false; - auto isARM = false; - auto isGPU = false; - { - const auto platform = Platform(args.platform_id); - const auto device = Device(platform, args.device_id); - if (!PrecisionSupported<T>(device)) { - printf("* Unsupported precision, skipping this tuning run\n\n"); - return; - } - isAMD = device.IsAMD(); - isARM = device.IsARM(); - isGPU = device.IsGPU(); - } - - // Creates input buffers with random data - auto x_vec = std::vector<T>(C::GetSizeX(args)); - auto y_vec = std::vector<T>(C::GetSizeY(args)); - auto a_mat = std::vector<T>(C::GetSizeA(args)); - auto b_mat = std::vector<T>(C::GetSizeB(args)); - auto c_mat = std::vector<T>(C::GetSizeC(args)); - auto temp = std::vector<T>(C::GetSizeTemp(args)); - PopulateVector(x_vec); - PopulateVector(y_vec); - PopulateVector(a_mat); - PopulateVector(b_mat); - PopulateVector(c_mat); - PopulateVector(temp); - - // Initializes the tuner for the chosen device - cltune::Tuner tuner(args.platform_id, args.device_id); - - // Use full-search to explore all parameter combinations or random-search to search only a part of - // the parameter values. The fraction is set as a command-line argument. - if (args.fraction == 1.0 || args.fraction == 0.0) { - tuner.UseFullSearch(); - } - else { - tuner.UseRandomSearch(1.0/args.fraction); - } - - // Set extra settings for specific defines. This mimics src/routine.cc. 
- auto defines = std::string{""}; - if (isAMD && isGPU) { - defines += "#define USE_CL_MAD 1\n"; - defines += "#define USE_STAGGERED_INDICES 1\n"; - } - if (isARM && isGPU) { - defines += "#define GLOBAL_MEM_FENCE 1\n"; - } - - // Loads the kernel sources and defines the kernel to tune - auto sources = defines + C::GetSources(); - auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize()); - tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef()); - - // Sets the tunable parameters and their possible values - C::SetParameters(tuner, id); - C::SetConstraints(tuner, id); - C::SetLocalMemorySize(tuner, id, args); - - // Tests for a specific precision - tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); - tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); - - // Modifies the thread-sizes (both global and local) based on the parameters - for (auto &parameters: C::MulLocal()) { tuner.MulLocalSize(id, parameters); } - for (auto &parameters: C::DivLocal()) { tuner.DivLocalSize(id, parameters); } - for (auto &parameters: C::MulGlobal()) { tuner.MulGlobalSize(id, parameters); } - for (auto &parameters: C::DivGlobal()) { tuner.DivGlobalSize(id, parameters); } - - // Sets the function's arguments - C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp); - - // Starts the tuning process - tuner.Tune(); - - // Prints the results to screen - auto time_ms = tuner.PrintToScreen(); - tuner.PrintFormatted(); - - // Also prints the performance of the best-case in terms of GB/s or GFLOPS - if (time_ms != 0.0) { - printf("[ -------> ] %.1lf ms", time_ms); - printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str()); - } - - // Outputs the results as JSON to disk, including some meta-data - auto precision_string = std::to_string(static_cast<size_t>(args.precision)); - auto metadata = 
std::vector<std::pair<std::string,std::string>>{ - {"kernel_family", C::KernelFamily()}, - {"precision", precision_string} - }; - for (auto &o: C::GetOptions()) { - if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); } - if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); } - if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); } - if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); } - if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); } - } - tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata); -} - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_TUNING_H_ -#endif diff --git a/include/internal/utilities.h b/include/internal/utilities.h deleted file mode 100644 index 7092bcdd..00000000 --- a/include/internal/utilities.h +++ /dev/null @@ -1,257 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file provides declarations for the common (test) utility functions such as a command-line -// argument parser. On top of this, it serves as the 'common' header, including the C++ OpenCL -// wrapper. These utilities are not only used for CLBlast, but also included as part of the tuners, -// the performance client and the correctness testers. 
-// -// ================================================================================================= - -#ifndef CLBLAST_UTILITIES_H_ -#define CLBLAST_UTILITIES_H_ - -#include <string> -#include <functional> -#include <complex> - -#include "clblast.h" -#include "clblast_half.h" -#include "internal/clpp11.h" - -namespace clblast { -// ================================================================================================= - -// Shorthands for complex data-types -using float2 = std::complex<float>; -using double2 = std::complex<double>; - -// Khronos OpenCL extensions -const std::string kKhronosHalfPrecision = "cl_khr_fp16"; -const std::string kKhronosDoublePrecision = "cl_khr_fp64"; - -// Catched an unknown error -constexpr auto kUnknownError = -999; - -// ================================================================================================= - -// The routine-specific arguments in string form -constexpr auto kArgM = "m"; -constexpr auto kArgN = "n"; -constexpr auto kArgK = "k"; -constexpr auto kArgKL = "kl"; -constexpr auto kArgKU = "ku"; -constexpr auto kArgLayout = "layout"; -constexpr auto kArgATransp = "transA"; -constexpr auto kArgBTransp = "transB"; -constexpr auto kArgSide = "side"; -constexpr auto kArgTriangle = "triangle"; -constexpr auto kArgDiagonal = "diagonal"; -constexpr auto kArgXInc = "incx"; -constexpr auto kArgYInc = "incy"; -constexpr auto kArgXOffset = "offx"; -constexpr auto kArgYOffset = "offy"; -constexpr auto kArgALeadDim = "lda"; -constexpr auto kArgBLeadDim = "ldb"; -constexpr auto kArgCLeadDim = "ldc"; -constexpr auto kArgAOffset = "offa"; -constexpr auto kArgBOffset = "offb"; -constexpr auto kArgCOffset = "offc"; -constexpr auto kArgAPOffset = "offap"; -constexpr auto kArgDotOffset = "offdot"; -constexpr auto kArgNrm2Offset = "offnrm2"; -constexpr auto kArgAsumOffset = "offasum"; -constexpr auto kArgImaxOffset = "offimax"; -constexpr auto kArgAlpha = "alpha"; -constexpr auto kArgBeta = "beta"; - -// The 
tuner-specific arguments in string form -constexpr auto kArgFraction = "fraction"; - -// The client-specific arguments in string form -constexpr auto kArgCompareclblas = "clblas"; -constexpr auto kArgComparecblas = "cblas"; -constexpr auto kArgStepSize = "step"; -constexpr auto kArgNumSteps = "num_steps"; -constexpr auto kArgNumRuns = "runs"; - -// The client-specific arguments in string form -constexpr auto kArgFullTest = "full_test"; -constexpr auto kArgVerbose = "verbose"; - -// The common arguments in string form -constexpr auto kArgPlatform = "platform"; -constexpr auto kArgDevice = "device"; -constexpr auto kArgPrecision = "precision"; -constexpr auto kArgHelp = "h"; -constexpr auto kArgQuiet = "q"; -constexpr auto kArgNoAbbreviations = "no_abbrv"; - -// ================================================================================================= - -// Returns a scalar with a default value -template <typename T> -T GetScalar(); - -// Returns a scalar of value 1 -template <typename T> -T ConstantOne(); - -// ================================================================================================= - -// Structure containing all possible arguments for test clients, including their default values -template <typename T> -struct Arguments { - // Routine-specific arguments - size_t m = 1; - size_t n = 1; - size_t k = 1; - size_t ku = 1; - size_t kl = 1; - Layout layout = Layout::kRowMajor; - Transpose a_transpose = Transpose::kNo; - Transpose b_transpose = Transpose::kNo; - Side side = Side::kLeft; - Triangle triangle = Triangle::kUpper; - Diagonal diagonal = Diagonal::kUnit; - size_t x_inc = 1; - size_t y_inc = 1; - size_t x_offset = 0; - size_t y_offset = 0; - size_t a_ld = 1; - size_t b_ld = 1; - size_t c_ld = 1; - size_t a_offset = 0; - size_t b_offset = 0; - size_t c_offset = 0; - size_t ap_offset = 0; - size_t dot_offset = 0; - size_t nrm2_offset = 0; - size_t asum_offset = 0; - size_t imax_offset = 0; - T alpha = ConstantOne<T>(); - T beta = 
ConstantOne<T>(); - size_t x_size = 1; - size_t y_size = 1; - size_t a_size = 1; - size_t b_size = 1; - size_t c_size = 1; - size_t ap_size = 1; - size_t scalar_size = 1; - // Tuner-specific arguments - double fraction = 1.0; - // Client-specific arguments - int compare_clblas = 1; - int compare_cblas = 1; - size_t step = 1; - size_t num_steps = 0; - size_t num_runs = 10; - // Common arguments - size_t platform_id = 0; - size_t device_id = 0; - Precision precision = Precision::kSingle; - bool print_help = false; - bool silent = false; - bool no_abbrv = false; -}; - -// Structure containing all possible buffers for test clients -template <typename T> -struct Buffers { - Buffer<T> x_vec; - Buffer<T> y_vec; - Buffer<T> a_mat; - Buffer<T> b_mat; - Buffer<T> c_mat; - Buffer<T> ap_mat; - Buffer<T> scalar; -}; - -// ================================================================================================= - -// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast -// data-types such as the Layout and Transpose data-types. 
-template <typename T> -std::string ToString(T value); - -// ================================================================================================= - -// Helper for the function "GetArgument" -template <typename T> -T ConvertArgument(const char* value); - -// Basic argument parser, matching patterns in the form of "-option value" and "--option value" -template <typename T> -T GetArgument(const int argc, char *argv[], std::string &help, - const std::string &option, const T default_value); - -// Returns the precision only -Precision GetPrecision(const int argc, char *argv[], - const Precision default_precision = Precision::kSingle); - -// As in "GetArgument", but now only checks whether an argument is given or not -bool CheckArgument(const int argc, char *argv[], std::string &help, const std::string &option); - -// ================================================================================================= - -// Helper function to check for errors in the status code -constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); } - -// ================================================================================================= - -// Returns a random number to be used as a seed -unsigned int GetRandomSeed(); - -// Test/example data lower and upper limit -constexpr auto kTestDataLowerLimit = -2.0; -constexpr auto kTestDataUpperLimit = 2.0; - -// Populates a vector with random data -template <typename T> -void PopulateVector(std::vector<T> &vector); - -// ================================================================================================= - -// Conversion between half and single-precision -std::vector<float> HalfToFloatBuffer(const std::vector<half>& source); -void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source); - -// As above, but now for OpenCL data-types instead of std::vectors -Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw); -void 
FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw); - -// ================================================================================================= - -// Rounding functions -size_t CeilDiv(const size_t x, const size_t y); -size_t Ceil(const size_t x, const size_t y); - -// Returns whether or not 'a' is a multiple of 'b' -bool IsMultiple(const size_t a, const size_t b); - -// ================================================================================================= - -// Convert the precision enum into bytes, e.g. a double takes up 8 bytes -size_t GetBytes(const Precision precision); - -// Convert the template argument into a precision value -template <typename T> -Precision PrecisionValue(); - -// ================================================================================================= - -// Returns false is this precision is not supported by the device -template <typename T> -bool PrecisionSupported(const Device &device); - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_UTILITIES_H_ -#endif |