Initial commit of preview version

author: CNugteren <web@cedricnugteren.nl> 2015-05-30 12:30:43 +0200
committer: CNugteren <web@cedricnugteren.nl> 2015-05-30 12:30:43 +0200
commit: bc5a341dfe591946e925db315fc7d8c0c25c2938 (patch)
tree: b216ab5eee4863e3807d92b5ddd19fa22197ed22 /include
parent: c7b054ea6747039f4405fd93da6e924f3e5c7f4b (diff)
15 files changed, 2028 insertions, 0 deletions
diff --git a/include/clblast.h b/include/clblast.h
new file mode 100644
index 00000000..4c3c5201
--- /dev/null
+++ b/include/clblast.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the interface to the CLBlast BLAS routines. It also contains the definitions
+// of the returned status codes and the layout and transpose types. This is the only header users
+// of CLBlast should include and use.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CLBLAST_H_
+#define CLBLAST_CLBLAST_H_
+
+#include <cstdlib> // For size_t
+
+// Includes the normal OpenCL C header
+#if defined(__APPLE__) || defined(__MACOSX)
+  #include <OpenCL/opencl.h>
+#else
+  #include <CL/opencl.h>
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+// Status codes. These codes can be returned by functions declared in this header file. The error
+// codes match either the standard OpenCL error codes or the clBLAS error codes. 
+enum class StatusCode {
+
+  // Status codes in common with the OpenCL standard
+  kSuccess                   =   0, // CL_SUCCESS
+  kTempBufferAllocFailure    =  -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
+  kBuildProgramFailure       = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
+  kInvalidBinary             = -42, // CL_INVALID_BINARY
+  kInvalidKernel             = -48, // CL_INVALID_KERNEL
+  kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
+  kInvalidLocalThreadsTotal  = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
+  kInvalidLocalThreadsDim    = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
+  kInvalidTempBufferSize     = -61, // CL_INVALID_BUFFER_SIZE
+
+  // Status codes in common with the clBLAS library
+  kNotImplemented            = -1024, // Routine or functionality not implemented yet
+  kInvalidMatrixA            = -1022, // Matrix A is not a valid OpenCL buffer
+  kInvalidMatrixB            = -1021, // Matrix B is not a valid OpenCL buffer
+  kInvalidMatrixC            = -1020, // Matrix C is not a valid OpenCL buffer
+  kInvalidVectorX            = -1019, // Vector X is not a valid OpenCL buffer
+  kInvalidVectorY            = -1018, // Vector Y is not a valid OpenCL buffer
+  kInvalidDimension          = -1017, // Dimensions M, N, and K have to be larger than zero
+  kInvalidLeadDimA           = -1016, // LD of A is smaller than the matrix's first dimension
+  kInvalidLeadDimB           = -1015, // LD of B is smaller than the matrix's first dimension
+  kInvalidLeadDimC           = -1014, // LD of C is smaller than the matrix's first dimension
+  kInvalidIncrementX         = -1013, // Increment of vector X cannot be zero
+  kInvalidIncrementY         = -1012, // Increment of vector Y cannot be zero
+  kInsufficientMemoryA       = -1011, // Matrix A's OpenCL buffer is too small
+  kInsufficientMemoryB       = -1010, // Matrix B's OpenCL buffer is too small
+  kInsufficientMemoryC       = -1009, // Matrix C's OpenCL buffer is too small
+  kInsufficientMemoryX       = -1008, // Vector X's OpenCL buffer is too small
+  kInsufficientMemoryY       = -1007, // Vector Y's OpenCL buffer is too small
+
+  // Custom additional status codes for CLBlast
+  kKernelLaunchError         = -2048, // Problem occurred when enqueuing the kernel
+  kKernelRunError            = -2047, // Problem occurred while running the kernel
+  kInvalidLocalMemUsage      = -2046, // Not enough local memory available on this device
+  kNoHalfPrecision           = -2045, // Half precision (16-bits) not supported by the device
+  kNoDoublePrecision         = -2044, // Double precision (64-bits) not supported by the device
+};
+
+// Matrix layout and transpose types
+enum class Layout { kRowMajor, kColMajor };
+enum class Transpose { kNo, kYes, kConjugate };
+enum class Side { kLeft, kRight };
+enum class Triangle { kUpper, kLower };
+
+// Precision scoped enum (values in bits)
+enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
+                       kComplexSingle = 3232, kComplexDouble = 6464 };
+
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+
+// Templated-precision vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
+template <typename T>
+StatusCode Axpy(const size_t m, const T alpha,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event);
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+
+// Templated-precision generalized matrix multiplication: SGEMM/DGEMM
+template <typename T>
+StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
+                const size_t m, const size_t n, const size_t k,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event);
+
+// Templated-precision symmetric matrix multiplication: SSYMM/DSYMM
+template <typename T>
+StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_CLBLAST_H_
+#endif
diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
new file mode 100644
index 00000000..73040fdb
--- /dev/null
+++ b/include/internal/clpp11.h
@@ -0,0 +1,524 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a C++11 wrapper around some OpenCL C data-types, similar to Khronos' cl.hpp.
+// The main differences are modern C++11 support and a straightforward implemenation of the basic
+// needs (as required for this project). It also includes some extra functionality not available
+// in cl.hpp, such as including the sources with a Program object and querying a Kernel's validity
+// in terms of local memory usage.
+//
+// This file is adapted from the C++ bindings from the CLTune project and therefore contains the
+// following copyright notice:
+//
+// =================================================================================================
+//
+// Copyright 2014 SURFsara
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//  http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CLPP11_H_
+#define CLBLAST_CLPP11_H_
+
+#include <utility> // std::swap
+#include <algorithm> // std::copy
+#include <string> // std::string
+#include <vector> // std::vector
+#include <stdexcept> // std::runtime_error
+
+// Includes the normal OpenCL C header
+#if defined(__APPLE__) || defined(__MACOSX)
+  #include <OpenCL/opencl.h>
+#else
+  #include <CL/opencl.h>
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+// Base class for any object
+class Object {
+ protected:
+
+  // Error handling (NOTE: these functions are [[noreturn]])
+  void Error(const std::string &message) const {
+    throw std::runtime_error("Internal OpenCL error: "+message);
+  }
+  void Error(const cl_int status) const {
+    throw std::runtime_error("Internal OpenCL error with status: "+std::to_string(status));
+  }
+};
+
+// =================================================================================================
+
+// Base class for objects which require memory management
+class ObjectWithState: public Object {
+
+};
+
+// =================================================================================================
+
+// C++11 version of cl_event
+class Event: public Object {
+ public:
+
+  // Constructor based on the plain C data-type
+  explicit Event(const cl_event event): event_(event) { }
+
+  // New event
+  Event(): event_() {}
+
+  // Public functions
+  size_t GetProfilingStart() const {
+    auto bytes = size_t{0};
+    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
+    auto result = size_t{0};
+    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &result, nullptr);
+    return result;
+  }
+  size_t GetProfilingEnd() const {
+    auto bytes = size_t{0};
+    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
+    auto result = size_t{0};
+    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &result, nullptr);
+    return result;
+  }
+  cl_int Wait() const {
+    return clWaitForEvents(1, &event_);
+  }
+
+  // Accessors to the private data-member
+  cl_event operator()() const { return event_; }
+  cl_event& operator()() { return event_; }
+ private:
+  cl_event event_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_platform_id
+class Platform: public Object {
+ public:
+
+  // Constructor based on the plain C data-type
+  explicit Platform(const cl_platform_id platform): platform_(platform) { }
+
+  // Initialize the platform. Note that this constructor can throw exceptions!
+  explicit Platform(const size_t platform_id) {
+    auto num_platforms = cl_uint{0};
+    auto status = clGetPlatformIDs(0, nullptr, &num_platforms);
+    if (status != CL_SUCCESS) { Error(status); }
+    if (num_platforms == 0) { Error("no platforms found"); }
+    auto platforms = std::vector<cl_platform_id>(num_platforms);
+    status = clGetPlatformIDs(num_platforms, platforms.data(), nullptr);
+    if (status != CL_SUCCESS) { Error(status); }
+    if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); }
+    platform_ = platforms[platform_id];
+  }
+
+  // Accessors to the private data-member
+  cl_platform_id operator()() const { return platform_; }
+  cl_platform_id& operator()() { return platform_; }
+ private:
+  cl_platform_id platform_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_device_id
+class Device: public Object {
+ public:
+
+  // Constructor based on the plain C data-type
+  explicit Device(const cl_device_id device): device_(device) { }
+
+  // Initialize the device. Note that this constructor can throw exceptions!
+  explicit Device(const Platform &platform, const cl_device_type type, const size_t device_id) {
+    auto num_devices = cl_uint{0};
+    auto status = clGetDeviceIDs(platform(), type, 0, nullptr, &num_devices);
+    if (status != CL_SUCCESS) { Error(status); }
+    if (num_devices == 0) { Error("no devices found"); }
+    auto devices = std::vector<cl_device_id>(num_devices);
+    status = clGetDeviceIDs(platform(), type, num_devices, devices.data(), nullptr);
+    if (status != CL_SUCCESS) { Error(status); }
+    if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
+    device_ = devices[device_id];
+  }
+
+  // Public functions
+  std::string Version()     const { return GetInfoString(CL_DEVICE_VERSION); }
+  cl_device_type Type()     const { return GetInfo<cl_device_type>(CL_DEVICE_TYPE); }
+  std::string Vendor()      const { return GetInfoString(CL_DEVICE_VENDOR); }
+  std::string Name()        const { return GetInfoString(CL_DEVICE_NAME); }
+  std::string Extensions()  const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
+  size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); }
+  cl_ulong LocalMemSize()   const { return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE); }
+  cl_uint MaxWorkItemDimensions() const {
+    return GetInfo<cl_uint>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
+  }
+  std::vector<size_t> MaxWorkItemSizes() const {
+    return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
+  }
+
+  // Configuration-validity checks
+  bool IsLocalMemoryValid(const size_t local_mem_usage) const {
+    return (local_mem_usage <= LocalMemSize());
+  }
+  bool IsThreadConfigValid(const std::vector<size_t> &local) const {
+    auto local_size = size_t{1};
+    for (auto &item: local) { local_size *= item; }
+    for (auto i=size_t{0}; i<local.size(); ++i) {
+      if (local[i] > MaxWorkItemSizes()[i]) { return false; }
+    }
+    if (local_size > MaxWorkGroupSize()) { return false; }
+    if (local.size() > MaxWorkItemDimensions()) { return false; }
+    return true;
+  }
+
+  // Accessors to the private data-member
+  cl_device_id operator()() const { return device_; }
+  cl_device_id& operator()() { return device_; }
+ private:
+
+  // Helper functions
+  template <typename T>
+  T GetInfo(const cl_device_info info) const {
+    auto bytes = size_t{0};
+    clGetDeviceInfo(device_, info, 0, nullptr, &bytes);
+    auto result = T(0);
+    clGetDeviceInfo(device_, info, bytes, &result, nullptr);
+    return result;
+  }
+  template <typename T>
+  std::vector<T> GetInfoVector(const cl_device_info info) const {
+    auto bytes = size_t{0};
+    clGetDeviceInfo(device_, info, 0, nullptr, &bytes);
+    auto result = std::vector<T>(bytes/sizeof(T));
+    clGetDeviceInfo(device_, info, bytes, result.data(), nullptr);
+    return result;
+  }
+  std::string GetInfoString(const cl_device_info info) const {
+    auto bytes = size_t{0};
+    clGetDeviceInfo(device_, info, 0, nullptr, &bytes);
+    auto result = std::vector<char>(bytes);
+    clGetDeviceInfo(device_, info, bytes, result.data(), nullptr);
+    return std::string(result.data());
+  }
+
+  cl_device_id device_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_context
+class Context: public ObjectWithState {
+ public:
+
+  // Constructor based on the plain C data-type
+  explicit Context(const cl_context context): context_(context) {
+    clRetainContext(context_);
+  }
+
+  // Memory management
+  explicit Context(const Device &device) {
+    auto status = CL_SUCCESS;
+    const cl_device_id dev = device();
+    context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
+    if (status != CL_SUCCESS) { Error(status); }
+  }
+  ~Context() {
+    clReleaseContext(context_);
+  }
+  Context(const Context &other):
+    context_(other.context_) {
+    clRetainContext(context_);
+  }
+  Context& operator=(Context other) {
+    swap(*this, other);
+    return *this;
+  }
+  friend void swap(Context &first, Context &second) {
+    std::swap(first.context_, second.context_);
+  }
+
+  // Accessors to the private data-member
+  cl_context operator()() const { return context_; }
+  cl_context& operator()() { return context_; }
+ private:
+  cl_context context_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_program. Additionally holds the program's source code.
+class Program: public ObjectWithState {
+ public:
+
+  // Note that there is no constructor based on the plain C data-type because of extra state
+
+  // Memory management
+  explicit Program(const Context &context, const std::string &source):
+    length_(source.length()) {
+      std::copy(source.begin(), source.end(), back_inserter(source_));
+      source_ptr_ = source_.data();
+      auto status = CL_SUCCESS;
+      program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
+      if (status != CL_SUCCESS) { Error(status); }
+    }
+  ~Program() {
+    clReleaseProgram(program_);
+  }
+  Program(const Program &other):
+      length_(other.length_),
+      source_(other.source_),
+      source_ptr_(other.source_ptr_),
+      program_(other.program_) {
+    clRetainProgram(program_);
+  }
+  Program& operator=(Program other) {
+    swap(*this, other);
+    return *this;
+  }
+  /*
+  TODO: Implement move construction/assignment?
+  Program(Program &&other) {
+    clRetainProgram(program_);
+    swap(*this, other);
+  }
+  Program& operator=(Program &&other) {
+    swap(*this, other);
+    return *this;
+  }*/
+  friend void swap(Program &first, Program &second) {
+    std::swap(first.length_, second.length_);
+    std::swap(first.source_, second.source_);
+    std::swap(first.source_ptr_, second.source_ptr_);
+    std::swap(first.program_, second.program_);
+  }
+
+  // Public functions
+  cl_int Build(const Device &device, const std::string &options) {
+    const cl_device_id dev = device();
+    return clBuildProgram(program_, 1, &dev, options.c_str(), nullptr, nullptr);
+  }
+  std::string GetBuildInfo(const Device &device) const {
+    auto bytes = size_t{0};
+    clGetProgramBuildInfo(program_, device(), CL_PROGRAM_BUILD_LOG, 0, nullptr, &bytes);
+    auto result = std::vector<char>(bytes);
+    clGetProgramBuildInfo(program_, device(), CL_PROGRAM_BUILD_LOG, bytes, result.data(), nullptr);
+    return std::string(result.data());
+  }
+
+  // Accessors to the private data-member
+  cl_program operator()() const { return program_; }
+  cl_program& operator()() { return program_; }
+ private:
+  size_t length_;
+  std::vector<char> source_;
+  const char* source_ptr_;
+  cl_program program_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_kernel
+class Kernel: public ObjectWithState {
+ public:
+
+  // Constructor based on the plain C data-type
+  explicit Kernel(const cl_kernel kernel): kernel_(kernel) {
+    clRetainKernel(kernel_);
+  }
+
+  // Memory management
+  explicit Kernel(const Program &program, const std::string &name) {
+    auto status = CL_SUCCESS;
+    kernel_ = clCreateKernel(program(), name.c_str(), &status);
+    if (status != CL_SUCCESS) { Error(status); }
+  }
+  ~Kernel() {
+    clReleaseKernel(kernel_);
+  }
+  Kernel(const Kernel &other):
+    kernel_(other.kernel_) {
+    clRetainKernel(kernel_);
+  }
+  Kernel& operator=(Kernel other) {
+    swap(*this, other);
+    return *this;
+  }
+  friend void swap(Kernel &first, Kernel &second) {
+    std::swap(first.kernel_, second.kernel_);
+  }
+
+  // Public functions
+  template <typename T> // Note: doesn't work with T=Buffer
+  cl_int SetArgument(const cl_uint index, const T &value) {
+    return clSetKernelArg(kernel_, index, sizeof(T), &value);
+  }
+  size_t LocalMemUsage(const Device &device) const {
+    auto bytes = size_t{0};
+    clGetKernelWorkGroupInfo(kernel_, device(), CL_KERNEL_LOCAL_MEM_SIZE, 0, nullptr, &bytes);
+    auto result = size_t{0};
+    clGetKernelWorkGroupInfo(kernel_, device(), CL_KERNEL_LOCAL_MEM_SIZE, bytes, &result, nullptr);
+    return result;
+  }
+
+  // Accessors to the private data-member
+  cl_kernel operator()() const { return kernel_; }
+  cl_kernel& operator()() { return kernel_; }
+ private:
+  cl_kernel kernel_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_command_queue
+class CommandQueue: public ObjectWithState {
+ public:
+
+  // Constructor based on the plain C data-type
+  explicit CommandQueue(const cl_command_queue queue): queue_(queue) {
+    clRetainCommandQueue(queue_);
+  }
+
+  // Memory management
+  explicit CommandQueue(const Context &context, const Device &device) {
+    auto status = CL_SUCCESS;
+    queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
+    if (status != CL_SUCCESS) { Error(status); }
+  }
+  ~CommandQueue() {
+    clReleaseCommandQueue(queue_);
+  }
+  CommandQueue(const CommandQueue &other):
+    queue_(other.queue_) {
+    clRetainCommandQueue(queue_);
+  }
+  CommandQueue& operator=(CommandQueue other) {
+    swap(*this, other);
+    return *this;
+  }
+  friend void swap(CommandQueue &first, CommandQueue &second) {
+    std::swap(first.queue_, second.queue_);
+  }
+
+  // Public functions
+  cl_int EnqueueKernel(const Kernel &kernel, const std::vector<size_t> &global,
+                       const std::vector<size_t> &local, Event &event) {
+    return clEnqueueNDRangeKernel(queue_, kernel(), static_cast<cl_uint>(global.size()), nullptr,
+                                  global.data(), local.data(), 0, nullptr, &(event()));
+  }
+  Context GetContext() const {
+    auto bytes = size_t{0};
+    clGetCommandQueueInfo(queue_, CL_QUEUE_CONTEXT, 0, nullptr, &bytes);
+    cl_context result;
+    clGetCommandQueueInfo(queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr);
+    return Context(result);
+  }
+  Device GetDevice() const {
+    auto bytes = size_t{0};
+    clGetCommandQueueInfo(queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes);
+    cl_device_id result;
+    clGetCommandQueueInfo(queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr);
+    return Device(result);
+  }
+  cl_int Finish() {
+    return clFinish(queue_);
+  }
+
+  // Accessors to the private data-member
+  cl_command_queue operator()() const { return queue_; }
+  cl_command_queue& operator()() { return queue_; }
+ private:
+  cl_command_queue queue_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_mem
+class Buffer: public ObjectWithState {
+ public:
+
+  // Constructor based on the plain C data-type
+  explicit Buffer(const cl_mem buffer): buffer_(buffer) {
+    clRetainMemObject(buffer_);
+  }
+
+  // Memory management
+  explicit Buffer(const Context &context, const cl_mem_flags flags, const size_t bytes) {
+    auto status = CL_SUCCESS;
+    buffer_ = clCreateBuffer(context(), flags, bytes, nullptr, &status);
+    if (status != CL_SUCCESS) { Error(status); }
+  }
+  ~Buffer() {
+    clReleaseMemObject(buffer_);
+  }
+  Buffer(const Buffer &other):
+    buffer_(other.buffer_) {
+    clRetainMemObject(buffer_);
+  }
+  Buffer& operator=(Buffer other) {
+    swap(*this, other);
+    return *this;
+  }
+  friend void swap(Buffer &first, Buffer &second) {
+    std::swap(first.buffer_, second.buffer_);
+  }
+
+  // Public functions
+  template <typename T>
+  cl_int ReadBuffer(const CommandQueue &queue, const size_t bytes, T* host) {
+    return clEnqueueReadBuffer(queue(), buffer_, CL_TRUE, 0, bytes, host, 0, nullptr, nullptr);
+  }
+  template <typename T>
+  cl_int ReadBuffer(const CommandQueue &queue, const size_t bytes, std::vector<T> &host) {
+    return ReadBuffer(queue, bytes, host.data());
+  }
+  template <typename T>
+  cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, const T* host) {
+    return clEnqueueWriteBuffer(queue(), buffer_, CL_TRUE, 0, bytes, host, 0, nullptr, nullptr);
+  }
+  template <typename T>
+  cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, const std::vector<T> &host) {
+    return WriteBuffer(queue, bytes, &host[0]);
+  }
+  size_t GetSize() const {
+    auto bytes = size_t{0};
+    auto status = clGetMemObjectInfo(buffer_, CL_MEM_SIZE, 0, nullptr, &bytes);
+    if (status != CL_SUCCESS) { Error(status); }
+    auto result = size_t{0};
+    status = clGetMemObjectInfo(buffer_, CL_MEM_SIZE, bytes, &result, nullptr);
+    if (status != CL_SUCCESS) { Error(status); }
+    return result;
+  }
+
+  // Accessors to the private data-member
+  cl_mem operator()() const { return buffer_; }
+  cl_mem& operator()() { return buffer_; }
+ private:
+  cl_mem buffer_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_CLPP11_H_
+#endif
diff --git a/include/internal/database.h b/include/internal/database.h
new file mode 100644
index 00000000..dbbdd5c0
--- /dev/null
+++ b/include/internal/database.h
@@ -0,0 +1,90 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Database class, providing a static variable holding the actual database
+// information. The class also provides utility functions to search the database and to access a
+// found entry by parameter-key. The database itself is filled in the corresponding source-file and
+// partially also by the database/xxxxx.h files, in which kernel-specific parameters are found.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_DATABASE_H_
+#define CLBLAST_DATABASE_H_
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include "internal/utilities.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+class Database {
+ public:
+
+  // Type alias for the database parameters
+  using Parameters = std::unordered_map<std::string,size_t>;
+
+  // Structures for content inside the database
+  struct DatabaseDevice {
+    const std::string name;
+    const Parameters parameters;
+  };
+  struct DatabaseVendor {
+    const cl_device_type type;
+    const std::string name;
+    const std::vector<DatabaseDevice> devices;
+  };
+  struct DatabaseEntry {
+    const std::string kernel;
+    const Precision precision;
+    const std::vector<DatabaseVendor> vendors;
+  };
+
+  // The default vendor or device
+  static constexpr auto kDefault = "Default";
+
+  // The database consists of separate database entries, stored together in a vector
+  static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
+  static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
+  static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
+  static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
+  static const DatabaseEntry TraSingle, TraDouble, TraComplexSingle, TraComplexDouble;
+  static const DatabaseEntry PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble;
+  static const std::vector<DatabaseEntry> database;
+
+  // The constructor
+  explicit Database(const CommandQueue &queue, const std::vector<std::string> &routines,
+                    const Precision precision);
+
+  // Accessor of values by key
+  size_t operator[](const std::string key) const { return parameters_.find(key)->second; }
+
+  // Obtain a list of OpenCL pre-processor defines based on the parameters
+  std::string GetDefines() const;
+
+ private:
+  Parameters Search(const std::string &this_kernel, const cl_device_type this_type,
+                    const std::string &this_vendor, const std::string &this_device,
+                    const Precision this_precision) const;
+
+  // Tests equality between a database-vendor string and an OpenCL vendor string
+  bool VendorEqual(const std::string &db_vendor, const std::string &cl_vendor) const;
+
+  // Found parameters suitable for this device/kernel
+  Parameters parameters_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_DATABASE_H_
+#endif
diff --git a/include/internal/database/copy.h b/include/internal/database/copy.h
new file mode 100644
index 00000000..b9335fc9
--- /dev/null
+++ b/include/internal/database/copy.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Copy kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::CopySingle = {
+  "Copy", Precision::kSingle, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
+        { "Tesla K20m",       { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_WPT",2}, {"COPY_VW",4} } },
+        { "Tesla K40m",       { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",4}, {"COPY_VW",4} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+        { "Iris",             { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",4} } },
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::CopyDouble = {
+  "Copy", Precision::kDouble, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+        { "Tesla K20m",       { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
+        { "Tesla K40m",       { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::CopyComplexSingle = {
+  "Copy", Precision::kComplexSingle, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+        { "Tesla K20m",       { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",1} } },
+        { "Tesla K40m",       { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+        { "Iris",             { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::CopyComplexDouble = {
+  "Copy", Precision::kComplexDouble, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+        { "Tesla K20m",       { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+        { "Tesla K40m",       { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/pad.h b/include/internal/database/pad.h
new file mode 100644
index 00000000..5af75308
--- /dev/null
+++ b/include/internal/database/pad.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Pad kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadSingle = {
+  "Pad", Precision::kSingle, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+        { "Tesla K20m",       { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "Tesla K40m",       { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+        { "Iris",             { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadDouble = {
+  "Pad", Precision::kDouble, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Tesla K20m",       { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Tesla K40m",       { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadComplexSingle = {
+  "Pad", Precision::kComplexSingle, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Tesla K20m",       { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "Tesla K40m",       { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+        { "Iris",             { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadComplexDouble = {
+  "Pad", Precision::kComplexDouble, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Tesla K20m",       { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Tesla K40m",       { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/padtranspose.h b/include/internal/database/padtranspose.h
new file mode 100644
index 00000000..f1127d60
--- /dev/null
+++ b/include/internal/database/padtranspose.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the PadTranspose kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadTraSingle = {
+  "PadTranspose", Precision::kSingle, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+        { "Tesla K20m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+        { "Tesla K40m",       { {"PADTRA_TILE",32}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+        { "Iris",             { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadTraDouble = {
+  "PadTranspose", Precision::kDouble, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+        { "Tesla K20m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+        { "Tesla K40m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadTraComplexSingle = {
+  "PadTranspose", Precision::kComplexSingle, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+        { "Tesla K20m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+        { "Tesla K40m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+        { "Iris",             { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadTraComplexDouble = {
+  "PadTranspose", Precision::kComplexDouble, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+        { "Tesla K20m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+        { "Tesla K40m",       { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/transpose.h b/include/internal/database/transpose.h
new file mode 100644
index 00000000..0814eb8a
--- /dev/null
+++ b/include/internal/database/transpose.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Transpose kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TraSingle = {
+  "Transpose", Precision::kSingle, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+        { "Iris",             { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0} } },
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TraDouble = {
+  "Transpose", Precision::kDouble, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TraComplexSingle = {
+  "Transpose", Precision::kComplexSingle, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+        { "Iris",             { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TraComplexDouble = {
+  "Transpose", Precision::kComplexDouble, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+        { "Tesla K20m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+        { "Tesla K40m",       { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/xaxpy.h b/include/internal/database/xaxpy.h
new file mode 100644
index 00000000..c331945a
--- /dev/null
+++ b/include/internal/database/xaxpy.h
@@ -0,0 +1,129 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Xaxpy kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpySingle = {
+  "Xaxpy", Precision::kSingle, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"WGS",128}, {"WPT",1}, {"VW",2} } },
+        { "Tesla K20m",       { {"WGS",128}, {"WPT",2}, {"VW",2} } },
+        { "Tesla K40m",       { {"WGS",128}, {"WPT",1}, {"VW",4} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"WGS",64}, {"WPT",1}, {"VW",2} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+        { "Iris",             { {"WGS",512}, {"WPT",1}, {"VW",1} } },
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpyDouble = {
+  "Xaxpy", Precision::kDouble, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+        { "Tesla K20m",       { {"WGS",512}, {"WPT",1}, {"VW",2} } },
+        { "Tesla K40m",       { {"WGS",64}, {"WPT",1}, {"VW",2} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"WGS",256}, {"WPT",1}, {"VW",1} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+      }
+    },
+  }
+};
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpyComplexSingle = {
+  "Xaxpy", Precision::kComplexSingle, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"WGS",256}, {"WPT",1}, {"VW",1} } },
+        { "Tesla K20m",       { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+        { "Tesla K40m",       { {"WGS",128}, {"WPT",2}, {"VW",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"WGS",64}, {"WPT",1}, {"VW",1} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+        { "Iris",             { {"WGS",256}, {"WPT",1}, {"VW",1} } },
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpyComplexDouble = {
+  "Xaxpy", Precision::kComplexDouble, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"WGS",128}, {"WPT",2}, {"VW",1} } },
+        { "Tesla K20m",       { {"WGS",256}, {"WPT",1}, {"VW",1} } },
+        { "Tesla K40m",       { {"WGS",64}, {"WPT",2}, {"VW",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"WGS",64}, {"WPT",1}, {"VW",1} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h
new file mode 100644
index 00000000..edf41e12
--- /dev/null
+++ b/include/internal/database/xgemm.h
@@ -0,0 +1,133 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Xgemm kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmSingle = {
+  "Xgemm", Precision::kSingle, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"MWG",128}, {"NWG",64}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+        { "Tesla K20m",       { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",4}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+        { "Tesla K40m",       { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+        { kDefault,           { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",8}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+        { "Iris",             { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDouble = {
+  "Xgemm", Precision::kDouble, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+        { "Tesla K20m",       { {"MWG",64}, {"NWG",128}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",32}, {"KWI",8}, {"VWM",2}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+        { "Tesla K40m",       { {"MWG",64}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+        { kDefault,           { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmComplexSingle = {
+  "Xgemm", Precision::kComplexSingle, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+        { "Tesla K20m",       { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",8}, {"KWI",8}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
+        { "Tesla K40m",       { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",0}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+        { kDefault,           { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+        { "Iris",             { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmComplexDouble = {
+  "Xgemm", Precision::kComplexDouble, {
+    { // NVIDIA GPUs
+      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+        { "GeForce GTX 480",  { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
+        { "Tesla K20m",       { {"MWG",16}, {"NWG",128}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",8}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
+        { "Tesla K40m",       { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",1} } },
+        { kDefault,           { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
+      }
+    },
+    { // AMD GPUs
+      CL_DEVICE_TYPE_GPU, "AMD", {
+        { "Tahiti",           { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+      }
+    },
+    { // Intel GPUs
+      CL_DEVICE_TYPE_GPU, "Intel", {
+      }
+    },
+    { // Default
+      CL_DEVICE_TYPE_ALL, kDefault, {
+        { kDefault,           { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+      }
+    },
+  }
+};
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/routine.h b/include/internal/routine.h
new file mode 100644
index 00000000..42357d8d
--- /dev/null
+++ b/include/internal/routine.h
@@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements all the basic functionality for the BLAS routines. This class serves as a
+// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as
+// compiling the OpenCL kernel, connecting to the database, etc.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINE_H_
+#define CLBLAST_ROUTINE_H_
+
+#include <string>
+#include <vector>
+
+#include "internal/utilities.h"
+#include "internal/database.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+class Routine {
+ public:
+
+  // Khronos OpenCL extensions
+  const std::string kKhronosHalfPrecision = "cl_khr_fp16";
+  const std::string kKhronosDoublePrecision = "cl_khr_fp64";
+
+  // New data-type:tThe cache of compiled OpenCL programs, along with some meta-data
+  struct ProgramCache {
+    Program program;
+    std::string device_name;
+    Precision precision;
+    std::vector<std::string> routines;
+
+    // Finds out whether the properties match
+    bool MatchInCache(const std::string &ref_name, const Precision &ref_precision,
+                      const std::vector<std::string> &ref_routines) {
+      auto ref_size = ref_routines.size();
+      if (device_name == ref_name && precision == ref_precision && routines.size() == ref_size) {
+        auto found_match = true;
+        for (auto i=size_t{0}; i<ref_size; ++i) {
+          if (routines[i] != ref_routines[i]) { found_match = false; }
+        }
+        return found_match;
+      }
+      return false;
+    }
+  };
+
+  // The actual cache, implemented as a vector of the above data-type
+  static std::vector<ProgramCache> program_cache_;
+
+  // Helper functions which check for errors in the status code
+  static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
+
+  // Base class constructor
+  explicit Routine(CommandQueue &queue, Event &event,
+                   const std::vector<std::string> &routines, const Precision precision);
+
+  // Set-up phase of the kernel
+  StatusCode SetUp(const std::string &routine_source);
+
+ protected:
+  
+  // Runs a kernel given the global and local thread sizes
+  StatusCode RunKernel(const Kernel &kernel, std::vector<size_t> &global,
+                       const std::vector<size_t> &local);
+
+  // Tests for valid inputs of matrices A, B, and C
+  StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer &buffer,
+                         const size_t offset, const size_t ld, const size_t data_size);
+  StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer &buffer,
+                         const size_t offset, const size_t ld, const size_t data_size);
+  StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer &buffer,
+                         const size_t offset, const size_t ld, const size_t data_size);
+
+  // Tests for valid inputs of vectors X and Y
+  StatusCode TestVectorX(const size_t n, const Buffer &buffer, const size_t offset,
+                         const size_t inc, const size_t data_size);
+  StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset,
+                         const size_t inc, const size_t data_size);
+
+  // Copies/transposes a matrix and padds/unpads it
+  StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
+                                    const size_t src_ld, const size_t src_offset,
+                                    const Buffer &src,
+                                    const size_t dest_one, const size_t dest_two,
+                                    const size_t dest_ld, const size_t dest_offset,
+                                    const Buffer &dest,
+                                    const bool do_transpose, const bool pad,
+                                    const Program &program);
+  
+  // Queries the cache and retrieve either a matching program or a boolean whether a match exists.
+  // The first assumes that the program is available in the cache and will throw an exception
+  // otherwise.
+  Program GetProgramFromCache() const;
+  bool ProgramIsInCache() const;
+
+  // Non-static variable for the precision. Note that the same variable (but static) might exist in
+  // a derived class.
+  const Precision precision_;
+
+  // The OpenCL objects, accessible only from derived classes
+  CommandQueue queue_;
+  Event event_;
+  const Context context_;
+  const Device device_;
+
+  // OpenCL device properties
+  const std::string device_name_;
+  const cl_uint max_work_item_dimensions_;
+  const std::vector<size_t> max_work_item_sizes_;
+  const size_t max_work_group_size_;
+
+  // Connection to the database for all the device-specific parameters
+  const Database db_;
+  const std::vector<std::string> routines_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINE_H_
+#endif
diff --git a/include/internal/routines/xaxpy.h b/include/internal/routines/xaxpy.h
new file mode 100644
index 00000000..e548e553
--- /dev/null
+++ b/include/internal/routines/xaxpy.h
@@ -0,0 +1,42 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xaxpy routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XAXPY_H_
+#define CLBLAST_ROUTINES_XAXPY_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xaxpy: public Routine {
+ public:
+  Xaxpy(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine
+  StatusCode DoAxpy(const size_t n, const T alpha,
+                    const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer &y_buffer, const size_t y_offset, const size_t y_inc);
+
+ private:
+  // Static variable to get the precision
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XAXPY_H_
+#endif
diff --git a/include/internal/routines/xgemm.h b/include/internal/routines/xgemm.h
new file mode 100644
index 00000000..7ad4fcfb
--- /dev/null
+++ b/include/internal/routines/xgemm.h
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemm routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGEMM_H_
+#define CLBLAST_ROUTINES_XGEMM_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xgemm: public Routine {
+ public:
+  Xgemm(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine
+  StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+                    const size_t m, const size_t n, const size_t k,
+                    const T alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                    const T beta,
+                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+  // Static variable to get the precision
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGEMM_H_
+#endif
diff --git a/include/internal/routines/xsymm.h b/include/internal/routines/xsymm.h
new file mode 100644
index 00000000..c6545164
--- /dev/null
+++ b/include/internal/routines/xsymm.h
@@ -0,0 +1,60 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymm routine. It is based on the generalized matrix multiplication
+// routine (Xgemm). The Xsymm class inherits from the templated class Xgemm, allowing it to call the
+// "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by
+// transforming it into a general matrix, and then calls the regular GEMM code.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYMM_H_
+#define CLBLAST_ROUTINES_XSYMM_H_
+
+#include "internal/routines/xgemm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsymm: public Xgemm<T> {
+ public:
+
+  // Uses several variables from the Routine class
+  using Routine::db_;
+  using Routine::context_;
+
+  // Uses several helper functions from the Routine class
+  using Routine::RunKernel;
+  using Routine::ErrorIn;
+  using Routine::TestMatrixA;
+  using Routine::GetProgramFromCache;
+
+  // Uses the regular Xgemm routine
+  using Xgemm<T>::DoGemm;
+
+  // Constructor
+  Xsymm(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                    const T beta,
+                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYMM_H_
+#endif
diff --git a/include/internal/tuning.h b/include/internal/tuning.h
new file mode 100644
index 00000000..7768888c
--- /dev/null
+++ b/include/internal/tuning.h
@@ -0,0 +1,53 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the header for the tuner functions. This is only used for the optional
+// and stand-alone tuner binaries and not part of the core of CLBlast. The convention used here is
+// that X and Y are vectors, while A, B, and C are matrices.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TUNING_H_
+#define CLBLAST_TUNING_H_
+
+#include <vector>
+#include <functional>
+
+#include <cltune.h>
+
+namespace clblast {
+// =================================================================================================
+
+// Functions with two or three OpenCL memory buffers
+template <typename T>
+using Tuner2 = std::function<void(const Arguments<T>&,
+                                  const std::vector<T>&, std::vector<T>&,
+                                  cltune::Tuner&)>;
+template <typename T>
+using Tuner3 = std::function<void(const Arguments<T>&,
+                                  const std::vector<T>&, const std::vector<T>&, std::vector<T>&,
+                                  cltune::Tuner&)>;
+
+// Tuner for vector-vector input
+template <typename T>
+void TunerXY(int argc, char* argv[], const Tuner2<T> &tune_function);
+
+// Tuner for matrix-matrix input
+template <typename T>
+void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function);
+
+// Tuner for matrix-matrix-matrix input
+template <typename T>
+void TunerABC(int argc, char* argv[], const Tuner3<T> &tune_function);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TUNING_H_
+#endif
diff --git a/include/internal/utilities.h b/include/internal/utilities.h
new file mode 100644
index 00000000..af04dfdb
--- /dev/null
+++ b/include/internal/utilities.h
@@ -0,0 +1,174 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides declarations for the common (test) utility functions such as a command-line
+// argument parser. On top of this, it serves as the 'common' header, including the C++ OpenCL
+// wrapper. These utilities are not only used for CLBlast, but also included as part of the tuners,
+// the performance client and the correctness testers.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_UTILITIES_H_
+#define CLBLAST_UTILITIES_H_
+
+#include <string>
+#include <functional>
+#include <complex>
+
+#include "clblast.h"
+#include "internal/clpp11.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Shorthands for complex data-types
+using float2 = std::complex<float>;
+using double2 = std::complex<double>;
+
+// =================================================================================================
+
+// The routine-specific arguments in string form
+constexpr auto kArgM = "m";
+constexpr auto kArgN = "n";
+constexpr auto kArgK = "k";
+constexpr auto kArgLayout = "layout";
+constexpr auto kArgATransp = "transA";
+constexpr auto kArgBTransp = "transB";
+constexpr auto kArgSide = "side";
+constexpr auto kArgTriangle = "triangle";
+constexpr auto kArgXInc = "incx";
+constexpr auto kArgYInc = "incy";
+constexpr auto kArgXOffset = "offx";
+constexpr auto kArgYOffset = "offy";
+constexpr auto kArgALeadDim = "lda";
+constexpr auto kArgBLeadDim = "ldb";
+constexpr auto kArgCLeadDim = "ldc";
+constexpr auto kArgAOffset = "offa";
+constexpr auto kArgBOffset = "offb";
+constexpr auto kArgCOffset = "offc";
+constexpr auto kArgAlpha = "alpha";
+constexpr auto kArgBeta = "beta";
+
+// The tuner-specific arguments in string form
+constexpr auto kArgFraction = "fraction";
+
+// The client-specific arguments in string form
+constexpr auto kArgCompareclblas = "clblas";
+constexpr auto kArgStepSize = "step";
+constexpr auto kArgNumSteps = "num_steps";
+constexpr auto kArgNumRuns = "runs";
+
+// The common arguments in string form
+constexpr auto kArgPlatform = "platform";
+constexpr auto kArgDevice = "device";
+constexpr auto kArgPrecision = "precision";
+constexpr auto kArgHelp = "h";
+constexpr auto kArgQuiet = "q";
+constexpr auto kArgNoAbbreviations = "no_abbrv";
+
+// =================================================================================================
+
+// Structure containing all possible arguments for test clients, including their default values
+template <typename T>
+struct Arguments {
+  // Routine-specific arguments
+  size_t m = 0;
+  size_t n = 0;
+  size_t k = 0;
+  Layout layout = Layout::kRowMajor;
+  Transpose a_transpose = Transpose::kNo;
+  Transpose b_transpose = Transpose::kNo;
+  Side side = Side::kLeft;
+  Triangle triangle = Triangle::kUpper;
+  size_t x_inc = 1;
+  size_t y_inc = 1;
+  size_t x_offset = 0;
+  size_t y_offset = 0;
+  size_t a_ld = 0;
+  size_t b_ld = 0;
+  size_t c_ld = 0;
+  size_t a_offset = 0;
+  size_t b_offset = 0;
+  size_t c_offset = 0;
+  T alpha = T{1.0};
+  T beta = T{1.0};
+  // Tuner-specific arguments
+  double fraction = 1.0;
+  // Client-specific arguments
+  bool compare_clblas = 1;
+  size_t step = 1;
+  size_t num_steps = 0;
+  size_t num_runs = 10;
+  // Common arguments
+  size_t platform_id = 0;
+  size_t device_id = 0;
+  Precision precision = Precision::kSingle;
+  bool print_help = false;
+  bool silent = false;
+  bool no_abbrv = false;
+};
+
+// =================================================================================================
+
+// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
+// data-types such as the Layout and Transpose data-types.
+template <typename T>
+std::string ToString(T value);
+
+// =================================================================================================
+
+// Helper for the function "GetArgument"
+template <typename T>
+T ConvertArgument(const char* value);
+
+// Basic argument parser, matching patterns in the form of "-option value" and "--option value"
+template <typename T>
+T GetArgument(const int argc, char *argv[], std::string &help,
+              const std::string &option, const T default_value);
+
+// Returns the precision only
+Precision GetPrecision(const int argc, char *argv[]);
+
+// As in "GetArgument", but now only checks whether an argument is given or not
+bool CheckArgument(const int argc, char *argv[], std::string &help, const std::string &option);
+
+// =================================================================================================
+
+// Returns a random number to be used as a seed
+unsigned int GetRandomSeed();
+
+// Populates a vector with random data
+template <typename T>
+void PopulateVector(std::vector<T> &vector);
+
+// =================================================================================================
+
+// Returns a scalar with a default value
+template <typename T>
+T GetScalar();
+
+// =================================================================================================
+
+// Rounding functions
+size_t CeilDiv(const size_t x, const size_t y);
+size_t Ceil(const size_t x, const size_t y);
+
+// Returns whether or not 'a' is a multiple of 'b'
+bool IsMultiple(const size_t a, const size_t b);
+
+// =================================================================================================
+
+// Convert the precision enum into bytes, e.g. a double takes up 8 bytes
+size_t GetBytes(const Precision precision);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_UTILITIES_H_
+#endif
author	CNugteren <web@cedricnugteren.nl>	2015-05-30 12:30:43 +0200
committer	CNugteren <web@cedricnugteren.nl>	2015-05-30 12:30:43 +0200
commit	bc5a341dfe591946e925db315fc7d8c0c25c2938 (patch)
tree	b216ab5eee4863e3807d92b5ddd19fa22197ed22 /include
parent	c7b054ea6747039f4405fd93da6e924f3e5c7f4b (diff)