summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorCNugteren <web@cedricnugteren.nl>2015-05-30 12:30:43 +0200
committerCNugteren <web@cedricnugteren.nl>2015-05-30 12:30:43 +0200
commitbc5a341dfe591946e925db315fc7d8c0c25c2938 (patch)
treeb216ab5eee4863e3807d92b5ddd19fa22197ed22 /include
parentc7b054ea6747039f4405fd93da6e924f3e5c7f4b (diff)
Initial commit of preview version
Diffstat (limited to 'include')
-rw-r--r--include/clblast.h125
-rw-r--r--include/internal/clpp11.h524
-rw-r--r--include/internal/database.h90
-rw-r--r--include/internal/database/copy.h130
-rw-r--r--include/internal/database/pad.h130
-rw-r--r--include/internal/database/padtranspose.h130
-rw-r--r--include/internal/database/transpose.h130
-rw-r--r--include/internal/database/xaxpy.h129
-rw-r--r--include/internal/database/xgemm.h133
-rw-r--r--include/internal/routine.h132
-rw-r--r--include/internal/routines/xaxpy.h42
-rw-r--r--include/internal/routines/xgemm.h46
-rw-r--r--include/internal/routines/xsymm.h60
-rw-r--r--include/internal/tuning.h53
-rw-r--r--include/internal/utilities.h174
15 files changed, 2028 insertions, 0 deletions
diff --git a/include/clblast.h b/include/clblast.h
new file mode 100644
index 00000000..4c3c5201
--- /dev/null
+++ b/include/clblast.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the interface to the CLBlast BLAS routines. It also contains the definitions
+// of the returned status codes and the layout and transpose types. This is the only header users
+// of CLBlast should include and use.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CLBLAST_H_
+#define CLBLAST_CLBLAST_H_
+
+#include <cstdlib> // For size_t
+
+// Includes the normal OpenCL C header
+#if defined(__APPLE__) || defined(__MACOSX)
+ #include <OpenCL/opencl.h>
+#else
+ #include <CL/opencl.h>
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+// Status codes. These codes can be returned by functions declared in this header file. The error
+// codes match either the standard OpenCL error codes or the clBLAS error codes.
+enum class StatusCode {
+
+ // Status codes in common with the OpenCL standard
+ kSuccess = 0, // CL_SUCCESS
+ kTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
+ kBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
+ kInvalidBinary = -42, // CL_INVALID_BINARY
+ kInvalidKernel = -48, // CL_INVALID_KERNEL
+ kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
+ kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
+ kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
+ kInvalidTempBufferSize = -61, // CL_INVALID_BUFFER_SIZE
+
+ // Status codes in common with the clBLAS library
+ kNotImplemented = -1024, // Routine or functionality not implemented yet
+ kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer
+ kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer
+ kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer
+ kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer
+ kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer
+ kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero
+ kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension
+ kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension
+ kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension
+ kInvalidIncrementX = -1013, // Increment of vector X cannot be zero
+ kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero
+ kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small
+ kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small
+ kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small
+ kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small
+ kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
+
+ // Custom additional status codes for CLBlast
+ kKernelLaunchError = -2048, // Problem occurred when enqueuing the kernel
+ kKernelRunError = -2047, // Problem occurred while running the kernel
+ kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
+ kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
+ kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
+};
+
+// Matrix layout and transpose types
+enum class Layout { kRowMajor, kColMajor };
+enum class Transpose { kNo, kYes, kConjugate };
+enum class Side { kLeft, kRight };
+enum class Triangle { kUpper, kLower };
+
+// Precision scoped enum (values in bits)
+enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
+ kComplexSingle = 3232, kComplexDouble = 6464 };
+
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+
+// Templated-precision vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
+template <typename T>
+StatusCode Axpy(const size_t m, const T alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+
+// Templated-precision generalized matrix multiplication: SGEMM/DGEMM
+template <typename T>
+StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
+ const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+
+// Templated-precision symmetric matrix multiplication: SSYMM/DSYMM
+template <typename T>
+StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
+ const size_t m, const size_t n,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_CLBLAST_H_
+#endif
diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
new file mode 100644
index 00000000..73040fdb
--- /dev/null
+++ b/include/internal/clpp11.h
@@ -0,0 +1,524 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a C++11 wrapper around some OpenCL C data-types, similar to Khronos' cl.hpp.
+// The main differences are modern C++11 support and a straightforward implemenation of the basic
+// needs (as required for this project). It also includes some extra functionality not available
+// in cl.hpp, such as including the sources with a Program object and querying a Kernel's validity
+// in terms of local memory usage.
+//
+// This file is adapted from the C++ bindings from the CLTune project and therefore contains the
+// following copyright notice:
+//
+// =================================================================================================
+//
+// Copyright 2014 SURFsara
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CLPP11_H_
+#define CLBLAST_CLPP11_H_
+
+#include <utility> // std::swap
+#include <algorithm> // std::copy
+#include <string> // std::string
+#include <vector> // std::vector
+#include <stdexcept> // std::runtime_error
+
+// Includes the normal OpenCL C header
+#if defined(__APPLE__) || defined(__MACOSX)
+ #include <OpenCL/opencl.h>
+#else
+ #include <CL/opencl.h>
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+// Base class for any object
+class Object {
+ protected:
+
+ // Error handling (NOTE: these functions are [[noreturn]])
+ void Error(const std::string &message) const {
+ throw std::runtime_error("Internal OpenCL error: "+message);
+ }
+ void Error(const cl_int status) const {
+ throw std::runtime_error("Internal OpenCL error with status: "+std::to_string(status));
+ }
+};
+
+// =================================================================================================
+
+// Base class for objects which require memory management
+class ObjectWithState: public Object {
+
+};
+
+// =================================================================================================
+
+// C++11 version of cl_event
+class Event: public Object {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit Event(const cl_event event): event_(event) { }
+
+ // New event
+ Event(): event_() {}
+
+ // Public functions
+ size_t GetProfilingStart() const {
+ auto bytes = size_t{0};
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
+ auto result = size_t{0};
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &result, nullptr);
+ return result;
+ }
+ size_t GetProfilingEnd() const {
+ auto bytes = size_t{0};
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
+ auto result = size_t{0};
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &result, nullptr);
+ return result;
+ }
+ cl_int Wait() const {
+ return clWaitForEvents(1, &event_);
+ }
+
+ // Accessors to the private data-member
+ cl_event operator()() const { return event_; }
+ cl_event& operator()() { return event_; }
+ private:
+ cl_event event_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_platform_id
+class Platform: public Object {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit Platform(const cl_platform_id platform): platform_(platform) { }
+
+ // Initialize the platform. Note that this constructor can throw exceptions!
+ explicit Platform(const size_t platform_id) {
+ auto num_platforms = cl_uint{0};
+ auto status = clGetPlatformIDs(0, nullptr, &num_platforms);
+ if (status != CL_SUCCESS) { Error(status); }
+ if (num_platforms == 0) { Error("no platforms found"); }
+ auto platforms = std::vector<cl_platform_id>(num_platforms);
+ status = clGetPlatformIDs(num_platforms, platforms.data(), nullptr);
+ if (status != CL_SUCCESS) { Error(status); }
+ if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); }
+ platform_ = platforms[platform_id];
+ }
+
+ // Accessors to the private data-member
+ cl_platform_id operator()() const { return platform_; }
+ cl_platform_id& operator()() { return platform_; }
+ private:
+ cl_platform_id platform_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_device_id
+class Device: public Object {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit Device(const cl_device_id device): device_(device) { }
+
+ // Initialize the device. Note that this constructor can throw exceptions!
+ explicit Device(const Platform &platform, const cl_device_type type, const size_t device_id) {
+ auto num_devices = cl_uint{0};
+ auto status = clGetDeviceIDs(platform(), type, 0, nullptr, &num_devices);
+ if (status != CL_SUCCESS) { Error(status); }
+ if (num_devices == 0) { Error("no devices found"); }
+ auto devices = std::vector<cl_device_id>(num_devices);
+ status = clGetDeviceIDs(platform(), type, num_devices, devices.data(), nullptr);
+ if (status != CL_SUCCESS) { Error(status); }
+ if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
+ device_ = devices[device_id];
+ }
+
+ // Public functions
+ std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); }
+ cl_device_type Type() const { return GetInfo<cl_device_type>(CL_DEVICE_TYPE); }
+ std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); }
+ std::string Name() const { return GetInfoString(CL_DEVICE_NAME); }
+ std::string Extensions() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
+ size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); }
+ cl_ulong LocalMemSize() const { return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE); }
+ cl_uint MaxWorkItemDimensions() const {
+ return GetInfo<cl_uint>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
+ }
+ std::vector<size_t> MaxWorkItemSizes() const {
+ return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
+ }
+
+ // Configuration-validity checks
+ bool IsLocalMemoryValid(const size_t local_mem_usage) const {
+ return (local_mem_usage <= LocalMemSize());
+ }
+ bool IsThreadConfigValid(const std::vector<size_t> &local) const {
+ auto local_size = size_t{1};
+ for (auto &item: local) { local_size *= item; }
+ for (auto i=size_t{0}; i<local.size(); ++i) {
+ if (local[i] > MaxWorkItemSizes()[i]) { return false; }
+ }
+ if (local_size > MaxWorkGroupSize()) { return false; }
+ if (local.size() > MaxWorkItemDimensions()) { return false; }
+ return true;
+ }
+
+ // Accessors to the private data-member
+ cl_device_id operator()() const { return device_; }
+ cl_device_id& operator()() { return device_; }
+ private:
+
+ // Helper functions
+ template <typename T>
+ T GetInfo(const cl_device_info info) const {
+ auto bytes = size_t{0};
+ clGetDeviceInfo(device_, info, 0, nullptr, &bytes);
+ auto result = T(0);
+ clGetDeviceInfo(device_, info, bytes, &result, nullptr);
+ return result;
+ }
+ template <typename T>
+ std::vector<T> GetInfoVector(const cl_device_info info) const {
+ auto bytes = size_t{0};
+ clGetDeviceInfo(device_, info, 0, nullptr, &bytes);
+ auto result = std::vector<T>(bytes/sizeof(T));
+ clGetDeviceInfo(device_, info, bytes, result.data(), nullptr);
+ return result;
+ }
+ std::string GetInfoString(const cl_device_info info) const {
+ auto bytes = size_t{0};
+ clGetDeviceInfo(device_, info, 0, nullptr, &bytes);
+ auto result = std::vector<char>(bytes);
+ clGetDeviceInfo(device_, info, bytes, result.data(), nullptr);
+ return std::string(result.data());
+ }
+
+ cl_device_id device_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_context
+class Context: public ObjectWithState {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit Context(const cl_context context): context_(context) {
+ clRetainContext(context_);
+ }
+
+ // Memory management
+ explicit Context(const Device &device) {
+ auto status = CL_SUCCESS;
+ const cl_device_id dev = device();
+ context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
+ if (status != CL_SUCCESS) { Error(status); }
+ }
+ ~Context() {
+ clReleaseContext(context_);
+ }
+ Context(const Context &other):
+ context_(other.context_) {
+ clRetainContext(context_);
+ }
+ Context& operator=(Context other) {
+ swap(*this, other);
+ return *this;
+ }
+ friend void swap(Context &first, Context &second) {
+ std::swap(first.context_, second.context_);
+ }
+
+ // Accessors to the private data-member
+ cl_context operator()() const { return context_; }
+ cl_context& operator()() { return context_; }
+ private:
+ cl_context context_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_program. Additionally holds the program's source code.
+class Program: public ObjectWithState {
+ public:
+
+ // Note that there is no constructor based on the plain C data-type because of extra state
+
+ // Memory management
+ explicit Program(const Context &context, const std::string &source):
+ length_(source.length()) {
+ std::copy(source.begin(), source.end(), back_inserter(source_));
+ source_ptr_ = source_.data();
+ auto status = CL_SUCCESS;
+ program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
+ if (status != CL_SUCCESS) { Error(status); }
+ }
+ ~Program() {
+ clReleaseProgram(program_);
+ }
+ Program(const Program &other):
+ length_(other.length_),
+ source_(other.source_),
+ source_ptr_(other.source_ptr_),
+ program_(other.program_) {
+ clRetainProgram(program_);
+ }
+ Program& operator=(Program other) {
+ swap(*this, other);
+ return *this;
+ }
+ /*
+ TODO: Implement move construction/assignment?
+ Program(Program &&other) {
+ clRetainProgram(program_);
+ swap(*this, other);
+ }
+ Program& operator=(Program &&other) {
+ swap(*this, other);
+ return *this;
+ }*/
+ friend void swap(Program &first, Program &second) {
+ std::swap(first.length_, second.length_);
+ std::swap(first.source_, second.source_);
+ std::swap(first.source_ptr_, second.source_ptr_);
+ std::swap(first.program_, second.program_);
+ }
+
+ // Public functions
+ cl_int Build(const Device &device, const std::string &options) {
+ const cl_device_id dev = device();
+ return clBuildProgram(program_, 1, &dev, options.c_str(), nullptr, nullptr);
+ }
+ std::string GetBuildInfo(const Device &device) const {
+ auto bytes = size_t{0};
+ clGetProgramBuildInfo(program_, device(), CL_PROGRAM_BUILD_LOG, 0, nullptr, &bytes);
+ auto result = std::vector<char>(bytes);
+ clGetProgramBuildInfo(program_, device(), CL_PROGRAM_BUILD_LOG, bytes, result.data(), nullptr);
+ return std::string(result.data());
+ }
+
+ // Accessors to the private data-member
+ cl_program operator()() const { return program_; }
+ cl_program& operator()() { return program_; }
+ private:
+ size_t length_;
+ std::vector<char> source_;
+ const char* source_ptr_;
+ cl_program program_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_kernel
+class Kernel: public ObjectWithState {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit Kernel(const cl_kernel kernel): kernel_(kernel) {
+ clRetainKernel(kernel_);
+ }
+
+ // Memory management
+ explicit Kernel(const Program &program, const std::string &name) {
+ auto status = CL_SUCCESS;
+ kernel_ = clCreateKernel(program(), name.c_str(), &status);
+ if (status != CL_SUCCESS) { Error(status); }
+ }
+ ~Kernel() {
+ clReleaseKernel(kernel_);
+ }
+ Kernel(const Kernel &other):
+ kernel_(other.kernel_) {
+ clRetainKernel(kernel_);
+ }
+ Kernel& operator=(Kernel other) {
+ swap(*this, other);
+ return *this;
+ }
+ friend void swap(Kernel &first, Kernel &second) {
+ std::swap(first.kernel_, second.kernel_);
+ }
+
+ // Public functions
+ template <typename T> // Note: doesn't work with T=Buffer
+ cl_int SetArgument(const cl_uint index, const T &value) {
+ return clSetKernelArg(kernel_, index, sizeof(T), &value);
+ }
+ size_t LocalMemUsage(const Device &device) const {
+ auto bytes = size_t{0};
+ clGetKernelWorkGroupInfo(kernel_, device(), CL_KERNEL_LOCAL_MEM_SIZE, 0, nullptr, &bytes);
+ auto result = size_t{0};
+ clGetKernelWorkGroupInfo(kernel_, device(), CL_KERNEL_LOCAL_MEM_SIZE, bytes, &result, nullptr);
+ return result;
+ }
+
+ // Accessors to the private data-member
+ cl_kernel operator()() const { return kernel_; }
+ cl_kernel& operator()() { return kernel_; }
+ private:
+ cl_kernel kernel_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_command_queue
+class CommandQueue: public ObjectWithState {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit CommandQueue(const cl_command_queue queue): queue_(queue) {
+ clRetainCommandQueue(queue_);
+ }
+
+ // Memory management
+ explicit CommandQueue(const Context &context, const Device &device) {
+ auto status = CL_SUCCESS;
+ queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
+ if (status != CL_SUCCESS) { Error(status); }
+ }
+ ~CommandQueue() {
+ clReleaseCommandQueue(queue_);
+ }
+ CommandQueue(const CommandQueue &other):
+ queue_(other.queue_) {
+ clRetainCommandQueue(queue_);
+ }
+ CommandQueue& operator=(CommandQueue other) {
+ swap(*this, other);
+ return *this;
+ }
+ friend void swap(CommandQueue &first, CommandQueue &second) {
+ std::swap(first.queue_, second.queue_);
+ }
+
+ // Public functions
+ cl_int EnqueueKernel(const Kernel &kernel, const std::vector<size_t> &global,
+ const std::vector<size_t> &local, Event &event) {
+ return clEnqueueNDRangeKernel(queue_, kernel(), static_cast<cl_uint>(global.size()), nullptr,
+ global.data(), local.data(), 0, nullptr, &(event()));
+ }
+ Context GetContext() const {
+ auto bytes = size_t{0};
+ clGetCommandQueueInfo(queue_, CL_QUEUE_CONTEXT, 0, nullptr, &bytes);
+ cl_context result;
+ clGetCommandQueueInfo(queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr);
+ return Context(result);
+ }
+ Device GetDevice() const {
+ auto bytes = size_t{0};
+ clGetCommandQueueInfo(queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes);
+ cl_device_id result;
+ clGetCommandQueueInfo(queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr);
+ return Device(result);
+ }
+ cl_int Finish() {
+ return clFinish(queue_);
+ }
+
+ // Accessors to the private data-member
+ cl_command_queue operator()() const { return queue_; }
+ cl_command_queue& operator()() { return queue_; }
+ private:
+ cl_command_queue queue_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_mem
+class Buffer: public ObjectWithState {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit Buffer(const cl_mem buffer): buffer_(buffer) {
+ clRetainMemObject(buffer_);
+ }
+
+ // Memory management
+ explicit Buffer(const Context &context, const cl_mem_flags flags, const size_t bytes) {
+ auto status = CL_SUCCESS;
+ buffer_ = clCreateBuffer(context(), flags, bytes, nullptr, &status);
+ if (status != CL_SUCCESS) { Error(status); }
+ }
+ ~Buffer() {
+ clReleaseMemObject(buffer_);
+ }
+ Buffer(const Buffer &other):
+ buffer_(other.buffer_) {
+ clRetainMemObject(buffer_);
+ }
+ Buffer& operator=(Buffer other) {
+ swap(*this, other);
+ return *this;
+ }
+ friend void swap(Buffer &first, Buffer &second) {
+ std::swap(first.buffer_, second.buffer_);
+ }
+
+ // Public functions
+ template <typename T>
+ cl_int ReadBuffer(const CommandQueue &queue, const size_t bytes, T* host) {
+ return clEnqueueReadBuffer(queue(), buffer_, CL_TRUE, 0, bytes, host, 0, nullptr, nullptr);
+ }
+ template <typename T>
+ cl_int ReadBuffer(const CommandQueue &queue, const size_t bytes, std::vector<T> &host) {
+ return ReadBuffer(queue, bytes, host.data());
+ }
+ template <typename T>
+ cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, const T* host) {
+ return clEnqueueWriteBuffer(queue(), buffer_, CL_TRUE, 0, bytes, host, 0, nullptr, nullptr);
+ }
+ template <typename T>
+ cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, const std::vector<T> &host) {
+ return WriteBuffer(queue, bytes, &host[0]);
+ }
+ size_t GetSize() const {
+ auto bytes = size_t{0};
+ auto status = clGetMemObjectInfo(buffer_, CL_MEM_SIZE, 0, nullptr, &bytes);
+ if (status != CL_SUCCESS) { Error(status); }
+ auto result = size_t{0};
+ status = clGetMemObjectInfo(buffer_, CL_MEM_SIZE, bytes, &result, nullptr);
+ if (status != CL_SUCCESS) { Error(status); }
+ return result;
+ }
+
+ // Accessors to the private data-member
+ cl_mem operator()() const { return buffer_; }
+ cl_mem& operator()() { return buffer_; }
+ private:
+ cl_mem buffer_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_CLPP11_H_
+#endif
diff --git a/include/internal/database.h b/include/internal/database.h
new file mode 100644
index 00000000..dbbdd5c0
--- /dev/null
+++ b/include/internal/database.h
@@ -0,0 +1,90 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Database class, providing a static variable holding the actual database
+// information. The class also provides utility functions to search the database and to access a
+// found entry by parameter-key. The database itself is filled in the corresponding source-file and
+// partially also by the database/xxxxx.h files, in which kernel-specific parameters are found.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_DATABASE_H_
+#define CLBLAST_DATABASE_H_
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include "internal/utilities.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+class Database {
+ public:
+
+ // Type alias for the database parameters
+ using Parameters = std::unordered_map<std::string,size_t>;
+
+ // Structures for content inside the database
+ struct DatabaseDevice {
+ const std::string name;
+ const Parameters parameters;
+ };
+ struct DatabaseVendor {
+ const cl_device_type type;
+ const std::string name;
+ const std::vector<DatabaseDevice> devices;
+ };
+ struct DatabaseEntry {
+ const std::string kernel;
+ const Precision precision;
+ const std::vector<DatabaseVendor> vendors;
+ };
+
+ // The default vendor or device
+ static constexpr auto kDefault = "Default";
+
+ // The database consists of separate database entries, stored together in a vector
+ static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
+ static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
+ static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
+ static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
+ static const DatabaseEntry TraSingle, TraDouble, TraComplexSingle, TraComplexDouble;
+ static const DatabaseEntry PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble;
+ static const std::vector<DatabaseEntry> database;
+
+ // The constructor
+ explicit Database(const CommandQueue &queue, const std::vector<std::string> &routines,
+ const Precision precision);
+
+ // Accessor of values by key
+ size_t operator[](const std::string key) const { return parameters_.find(key)->second; }
+
+ // Obtain a list of OpenCL pre-processor defines based on the parameters
+ std::string GetDefines() const;
+
+ private:
+ Parameters Search(const std::string &this_kernel, const cl_device_type this_type,
+ const std::string &this_vendor, const std::string &this_device,
+ const Precision this_precision) const;
+
+ // Tests equality between a database-vendor string and an OpenCL vendor string
+ bool VendorEqual(const std::string &db_vendor, const std::string &cl_vendor) const;
+
+ // Found parameters suitable for this device/kernel
+ Parameters parameters_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_DATABASE_H_
+#endif
diff --git a/include/internal/database/copy.h b/include/internal/database/copy.h
new file mode 100644
index 00000000..b9335fc9
--- /dev/null
+++ b/include/internal/database/copy.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Copy kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::CopySingle = {
+ "Copy", Precision::kSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
+ { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_WPT",2}, {"COPY_VW",4} } },
+ { "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",4}, {"COPY_VW",4} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",4} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::CopyDouble = {
+ "Copy", Precision::kDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ { "Tesla K20m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
+ { "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::CopyComplexSingle = {
+ "Copy", Precision::kComplexSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ { "Tesla K20m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",1} } },
+ { "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::CopyComplexDouble = {
+ "Copy", Precision::kComplexDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/pad.h b/include/internal/database/pad.h
new file mode 100644
index 00000000..5af75308
--- /dev/null
+++ b/include/internal/database/pad.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Pad kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadSingle = {
+ "Pad", Precision::kSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+ { "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadDouble = {
+ "Pad", Precision::kDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadComplexSingle = {
+ "Pad", Precision::kComplexSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadComplexDouble = {
+ "Pad", Precision::kComplexDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/padtranspose.h b/include/internal/database/padtranspose.h
new file mode 100644
index 00000000..f1127d60
--- /dev/null
+++ b/include/internal/database/padtranspose.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the PadTranspose kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadTraSingle = {
+ "PadTranspose", Precision::kSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+ { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+ { "Tesla K40m", { {"PADTRA_TILE",32}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadTraDouble = {
+ "PadTranspose", Precision::kDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ { "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadTraComplexSingle = {
+ "PadTranspose", Precision::kComplexSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ { "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadTraComplexDouble = {
+ "PadTranspose", Precision::kComplexDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ { "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/transpose.h b/include/internal/database/transpose.h
new file mode 100644
index 00000000..0814eb8a
--- /dev/null
+++ b/include/internal/database/transpose.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Transpose kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TraSingle = {
+ "Transpose", Precision::kSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TraDouble = {
+ "Transpose", Precision::kDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TraComplexSingle = {
+ "Transpose", Precision::kComplexSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+ { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+ { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TraComplexDouble = {
+ "Transpose", Precision::kComplexDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+ { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+ { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/xaxpy.h b/include/internal/database/xaxpy.h
new file mode 100644
index 00000000..c331945a
--- /dev/null
+++ b/include/internal/database/xaxpy.h
@@ -0,0 +1,129 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Xaxpy kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpySingle = {
+ "Xaxpy", Precision::kSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",2} } },
+ { "Tesla K20m", { {"WGS",128}, {"WPT",2}, {"VW",2} } },
+ { "Tesla K40m", { {"WGS",128}, {"WPT",1}, {"VW",4} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"WGS",512}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpyDouble = {
+ "Xaxpy", Precision::kDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+ { "Tesla K20m", { {"WGS",512}, {"WPT",1}, {"VW",2} } },
+ { "Tesla K40m", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ }
+};
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpyComplexSingle = {
+ "Xaxpy", Precision::kComplexSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
+ { "Tesla K20m", { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+ { "Tesla K40m", { {"WGS",128}, {"WPT",2}, {"VW",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpyComplexDouble = {
+ "Xaxpy", Precision::kComplexDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"WGS",128}, {"WPT",2}, {"VW",1} } },
+ { "Tesla K20m", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
+ { "Tesla K40m", { {"WGS",64}, {"WPT",2}, {"VW",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h
new file mode 100644
index 00000000..edf41e12
--- /dev/null
+++ b/include/internal/database/xgemm.h
@@ -0,0 +1,133 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Xgemm kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmSingle = {
+ "Xgemm", Precision::kSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"MWG",128}, {"NWG",64}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+ { "Tesla K20m", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",4}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+ { "Tesla K40m", { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+ { kDefault, { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",8}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDouble = {
+ "Xgemm", Precision::kDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+ { "Tesla K20m", { {"MWG",64}, {"NWG",128}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",32}, {"KWI",8}, {"VWM",2}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+ { "Tesla K40m", { {"MWG",64}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+ { kDefault, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmComplexSingle = {
+ "Xgemm", Precision::kComplexSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+ { "Tesla K20m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",8}, {"KWI",8}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
+ { "Tesla K40m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",0}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+ { kDefault, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmComplexDouble = {
+ "Xgemm", Precision::kComplexDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
+ { "Tesla K20m", { {"MWG",16}, {"NWG",128}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",8}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
+ { "Tesla K40m", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",1} } },
+ { kDefault, { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ }
+};
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/routine.h b/include/internal/routine.h
new file mode 100644
index 00000000..42357d8d
--- /dev/null
+++ b/include/internal/routine.h
@@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements all the basic functionality for the BLAS routines. This class serves as a
+// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as
+// compiling the OpenCL kernel, connecting to the database, etc.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINE_H_
+#define CLBLAST_ROUTINE_H_
+
+#include <string>
+#include <vector>
+
+#include "internal/utilities.h"
+#include "internal/database.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+class Routine {
+ public:
+
+ // Khronos OpenCL extensions
+ const std::string kKhronosHalfPrecision = "cl_khr_fp16";
+ const std::string kKhronosDoublePrecision = "cl_khr_fp64";
+
+ // New data-type:tThe cache of compiled OpenCL programs, along with some meta-data
+ struct ProgramCache {
+ Program program;
+ std::string device_name;
+ Precision precision;
+ std::vector<std::string> routines;
+
+ // Finds out whether the properties match
+ bool MatchInCache(const std::string &ref_name, const Precision &ref_precision,
+ const std::vector<std::string> &ref_routines) {
+ auto ref_size = ref_routines.size();
+ if (device_name == ref_name && precision == ref_precision && routines.size() == ref_size) {
+ auto found_match = true;
+ for (auto i=size_t{0}; i<ref_size; ++i) {
+ if (routines[i] != ref_routines[i]) { found_match = false; }
+ }
+ return found_match;
+ }
+ return false;
+ }
+ };
+
+ // The actual cache, implemented as a vector of the above data-type
+ static std::vector<ProgramCache> program_cache_;
+
+ // Helper functions which check for errors in the status code
+ static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
+
+ // Base class constructor
+ explicit Routine(CommandQueue &queue, Event &event,
+ const std::vector<std::string> &routines, const Precision precision);
+
+ // Set-up phase of the kernel
+ StatusCode SetUp(const std::string &routine_source);
+
+ protected:
+
+ // Runs a kernel given the global and local thread sizes
+ StatusCode RunKernel(const Kernel &kernel, std::vector<size_t> &global,
+ const std::vector<size_t> &local);
+
+ // Tests for valid inputs of matrices A, B, and C
+ StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer &buffer,
+ const size_t offset, const size_t ld, const size_t data_size);
+ StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer &buffer,
+ const size_t offset, const size_t ld, const size_t data_size);
+ StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer &buffer,
+ const size_t offset, const size_t ld, const size_t data_size);
+
+ // Tests for valid inputs of vectors X and Y
+ StatusCode TestVectorX(const size_t n, const Buffer &buffer, const size_t offset,
+ const size_t inc, const size_t data_size);
+ StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset,
+ const size_t inc, const size_t data_size);
+
+ // Copies/transposes a matrix and padds/unpads it
+ StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
+ const size_t src_ld, const size_t src_offset,
+ const Buffer &src,
+ const size_t dest_one, const size_t dest_two,
+ const size_t dest_ld, const size_t dest_offset,
+ const Buffer &dest,
+ const bool do_transpose, const bool pad,
+ const Program &program);
+
+ // Queries the cache and retrieve either a matching program or a boolean whether a match exists.
+ // The first assumes that the program is available in the cache and will throw an exception
+ // otherwise.
+ Program GetProgramFromCache() const;
+ bool ProgramIsInCache() const;
+
+ // Non-static variable for the precision. Note that the same variable (but static) might exist in
+ // a derived class.
+ const Precision precision_;
+
+ // The OpenCL objects, accessible only from derived classes
+ CommandQueue queue_;
+ Event event_;
+ const Context context_;
+ const Device device_;
+
+ // OpenCL device properties
+ const std::string device_name_;
+ const cl_uint max_work_item_dimensions_;
+ const std::vector<size_t> max_work_item_sizes_;
+ const size_t max_work_group_size_;
+
+ // Connection to the database for all the device-specific parameters
+ const Database db_;
+ const std::vector<std::string> routines_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINE_H_
+#endif
diff --git a/include/internal/routines/xaxpy.h b/include/internal/routines/xaxpy.h
new file mode 100644
index 00000000..e548e553
--- /dev/null
+++ b/include/internal/routines/xaxpy.h
@@ -0,0 +1,42 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xaxpy routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XAXPY_H_
+#define CLBLAST_ROUTINES_XAXPY_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xaxpy: public Routine {
+ public:
+ Xaxpy(CommandQueue &queue, Event &event);
+
+ // Templated-precision implementation of the routine
+ StatusCode DoAxpy(const size_t n, const T alpha,
+ const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer &y_buffer, const size_t y_offset, const size_t y_inc);
+
+ private:
+ // Static variable to get the precision
+ const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XAXPY_H_
+#endif
diff --git a/include/internal/routines/xgemm.h b/include/internal/routines/xgemm.h
new file mode 100644
index 00000000..7ad4fcfb
--- /dev/null
+++ b/include/internal/routines/xgemm.h
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemm routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGEMM_H_
+#define CLBLAST_ROUTINES_XGEMM_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xgemm: public Routine {
+ public:
+ Xgemm(CommandQueue &queue, Event &event);
+
+ // Templated-precision implementation of the routine
+ StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+ // Static variable to get the precision
+ const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGEMM_H_
+#endif
diff --git a/include/internal/routines/xsymm.h b/include/internal/routines/xsymm.h
new file mode 100644
index 00000000..c6545164
--- /dev/null
+++ b/include/internal/routines/xsymm.h
@@ -0,0 +1,60 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymm routine. It is based on the generalized matrix multiplication
+// routine (Xgemm). The Xsymm class inherits from the templated class Xgemm, allowing it to call the
+// "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by
+// transforming it into a general matrix, and then calls the regular GEMM code.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYMM_H_
+#define CLBLAST_ROUTINES_XSYMM_H_
+
+#include "internal/routines/xgemm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsymm: public Xgemm<T> {
+ public:
+
+ // Uses several variables from the Routine class
+ using Routine::db_;
+ using Routine::context_;
+
+ // Uses several helper functions from the Routine class
+ using Routine::RunKernel;
+ using Routine::ErrorIn;
+ using Routine::TestMatrixA;
+ using Routine::GetProgramFromCache;
+
+ // Uses the regular Xgemm routine
+ using Xgemm<T>::DoGemm;
+
+ // Constructor
+ Xsymm(CommandQueue &queue, Event &event);
+
+ // Templated-precision implementation of the routine
+ StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYMM_H_
+#endif
diff --git a/include/internal/tuning.h b/include/internal/tuning.h
new file mode 100644
index 00000000..7768888c
--- /dev/null
+++ b/include/internal/tuning.h
@@ -0,0 +1,53 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the header for the tuner functions. This is only used for the optional
+// and stand-alone tuner binaries and not part of the core of CLBlast. The convention used here is
+// that X and Y are vectors, while A, B, and C are matrices.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TUNING_H_
+#define CLBLAST_TUNING_H_
+
+#include <vector>
+#include <functional>
+
+#include <cltune.h>
+
+namespace clblast {
+// =================================================================================================
+
+// Functions with two or three OpenCL memory buffers
+template <typename T>
+using Tuner2 = std::function<void(const Arguments<T>&,
+ const std::vector<T>&, std::vector<T>&,
+ cltune::Tuner&)>;
+template <typename T>
+using Tuner3 = std::function<void(const Arguments<T>&,
+ const std::vector<T>&, const std::vector<T>&, std::vector<T>&,
+ cltune::Tuner&)>;
+
+// Tuner for vector-vector input
+template <typename T>
+void TunerXY(int argc, char* argv[], const Tuner2<T> &tune_function);
+
+// Tuner for matrix-matrix input
+template <typename T>
+void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function);
+
+// Tuner for matrix-matrix-matrix input
+template <typename T>
+void TunerABC(int argc, char* argv[], const Tuner3<T> &tune_function);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TUNING_H_
+#endif
diff --git a/include/internal/utilities.h b/include/internal/utilities.h
new file mode 100644
index 00000000..af04dfdb
--- /dev/null
+++ b/include/internal/utilities.h
@@ -0,0 +1,174 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides declarations for the common (test) utility functions such as a command-line
+// argument parser. On top of this, it serves as the 'common' header, including the C++ OpenCL
+// wrapper. These utilities are not only used for CLBlast, but also included as part of the tuners,
+// the performance client and the correctness testers.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_UTILITIES_H_
+#define CLBLAST_UTILITIES_H_
+
+#include <string>
+#include <functional>
+#include <complex>
+
+#include "clblast.h"
+#include "internal/clpp11.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Shorthands for complex data-types
+using float2 = std::complex<float>;
+using double2 = std::complex<double>;
+
+// =================================================================================================
+
+// The routine-specific arguments in string form
+constexpr auto kArgM = "m";
+constexpr auto kArgN = "n";
+constexpr auto kArgK = "k";
+constexpr auto kArgLayout = "layout";
+constexpr auto kArgATransp = "transA";
+constexpr auto kArgBTransp = "transB";
+constexpr auto kArgSide = "side";
+constexpr auto kArgTriangle = "triangle";
+constexpr auto kArgXInc = "incx";
+constexpr auto kArgYInc = "incy";
+constexpr auto kArgXOffset = "offx";
+constexpr auto kArgYOffset = "offy";
+constexpr auto kArgALeadDim = "lda";
+constexpr auto kArgBLeadDim = "ldb";
+constexpr auto kArgCLeadDim = "ldc";
+constexpr auto kArgAOffset = "offa";
+constexpr auto kArgBOffset = "offb";
+constexpr auto kArgCOffset = "offc";
+constexpr auto kArgAlpha = "alpha";
+constexpr auto kArgBeta = "beta";
+
+// The tuner-specific arguments in string form
+constexpr auto kArgFraction = "fraction";
+
+// The client-specific arguments in string form
+constexpr auto kArgCompareclblas = "clblas";
+constexpr auto kArgStepSize = "step";
+constexpr auto kArgNumSteps = "num_steps";
+constexpr auto kArgNumRuns = "runs";
+
+// The common arguments in string form
+constexpr auto kArgPlatform = "platform";
+constexpr auto kArgDevice = "device";
+constexpr auto kArgPrecision = "precision";
+constexpr auto kArgHelp = "h";
+constexpr auto kArgQuiet = "q";
+constexpr auto kArgNoAbbreviations = "no_abbrv";
+
+// =================================================================================================
+
+// Structure containing all possible arguments for test clients, including their default values
+template <typename T>
+struct Arguments {
+ // Routine-specific arguments
+ size_t m = 0;
+ size_t n = 0;
+ size_t k = 0;
+ Layout layout = Layout::kRowMajor;
+ Transpose a_transpose = Transpose::kNo;
+ Transpose b_transpose = Transpose::kNo;
+ Side side = Side::kLeft;
+ Triangle triangle = Triangle::kUpper;
+ size_t x_inc = 1;
+ size_t y_inc = 1;
+ size_t x_offset = 0;
+ size_t y_offset = 0;
+ size_t a_ld = 0;
+ size_t b_ld = 0;
+ size_t c_ld = 0;
+ size_t a_offset = 0;
+ size_t b_offset = 0;
+ size_t c_offset = 0;
+ T alpha = T{1.0};
+ T beta = T{1.0};
+ // Tuner-specific arguments
+ double fraction = 1.0;
+ // Client-specific arguments
+ bool compare_clblas = 1;
+ size_t step = 1;
+ size_t num_steps = 0;
+ size_t num_runs = 10;
+ // Common arguments
+ size_t platform_id = 0;
+ size_t device_id = 0;
+ Precision precision = Precision::kSingle;
+ bool print_help = false;
+ bool silent = false;
+ bool no_abbrv = false;
+};
+
+// =================================================================================================
+
+// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
+// data-types such as the Layout and Transpose data-types.
+template <typename T>
+std::string ToString(T value);
+
+// =================================================================================================
+
+// Helper for the function "GetArgument"
+template <typename T>
+T ConvertArgument(const char* value);
+
+// Basic argument parser, matching patterns in the form of "-option value" and "--option value"
+template <typename T>
+T GetArgument(const int argc, char *argv[], std::string &help,
+ const std::string &option, const T default_value);
+
+// Returns the precision only
+Precision GetPrecision(const int argc, char *argv[]);
+
+// As in "GetArgument", but now only checks whether an argument is given or not
+bool CheckArgument(const int argc, char *argv[], std::string &help, const std::string &option);
+
+// =================================================================================================
+
+// Returns a random number to be used as a seed
+unsigned int GetRandomSeed();
+
+// Populates a vector with random data
+template <typename T>
+void PopulateVector(std::vector<T> &vector);
+
+// =================================================================================================
+
+// Returns a scalar with a default value
+template <typename T>
+T GetScalar();
+
+// =================================================================================================
+
+// Rounding functions
+size_t CeilDiv(const size_t x, const size_t y);
+size_t Ceil(const size_t x, const size_t y);
+
+// Returns whether or not 'a' is a multiple of 'b'
+bool IsMultiple(const size_t a, const size_t b);
+
+// =================================================================================================
+
+// Convert the precision enum into bytes, e.g. a double takes up 8 bytes
+size_t GetBytes(const Precision precision);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_UTILITIES_H_
+#endif