diff options
author | CNugteren <web@cedricnugteren.nl> | 2015-05-30 12:30:43 +0200 |
---|---|---|
committer | CNugteren <web@cedricnugteren.nl> | 2015-05-30 12:30:43 +0200 |
commit | bc5a341dfe591946e925db315fc7d8c0c25c2938 (patch) | |
tree | b216ab5eee4863e3807d92b5ddd19fa22197ed22 /include/internal | |
parent | c7b054ea6747039f4405fd93da6e924f3e5c7f4b (diff) |
Initial commit of preview version
Diffstat (limited to 'include/internal')
-rw-r--r-- | include/internal/clpp11.h | 524 | ||||
-rw-r--r-- | include/internal/database.h | 90 | ||||
-rw-r--r-- | include/internal/database/copy.h | 130 | ||||
-rw-r--r-- | include/internal/database/pad.h | 130 | ||||
-rw-r--r-- | include/internal/database/padtranspose.h | 130 | ||||
-rw-r--r-- | include/internal/database/transpose.h | 130 | ||||
-rw-r--r-- | include/internal/database/xaxpy.h | 129 | ||||
-rw-r--r-- | include/internal/database/xgemm.h | 133 | ||||
-rw-r--r-- | include/internal/routine.h | 132 | ||||
-rw-r--r-- | include/internal/routines/xaxpy.h | 42 | ||||
-rw-r--r-- | include/internal/routines/xgemm.h | 46 | ||||
-rw-r--r-- | include/internal/routines/xsymm.h | 60 | ||||
-rw-r--r-- | include/internal/tuning.h | 53 | ||||
-rw-r--r-- | include/internal/utilities.h | 174 |
14 files changed, 1903 insertions, 0 deletions
diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h new file mode 100644 index 00000000..73040fdb --- /dev/null +++ b/include/internal/clpp11.h @@ -0,0 +1,524 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a C++11 wrapper around some OpenCL C data-types, similar to Khronos' cl.hpp. +// The main differences are modern C++11 support and a straightforward implemenation of the basic +// needs (as required for this project). It also includes some extra functionality not available +// in cl.hpp, such as including the sources with a Program object and querying a Kernel's validity +// in terms of local memory usage. +// +// This file is adapted from the C++ bindings from the CLTune project and therefore contains the +// following copyright notice: +// +// ================================================================================================= +// +// Copyright 2014 SURFsara +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// ================================================================================================= + +#ifndef CLBLAST_CLPP11_H_ +#define CLBLAST_CLPP11_H_ + +#include <utility> // std::swap +#include <algorithm> // std::copy +#include <string> // std::string +#include <vector> // std::vector +#include <stdexcept> // std::runtime_error + +// Includes the normal OpenCL C header +#if defined(__APPLE__) || defined(__MACOSX) + #include <OpenCL/opencl.h> +#else + #include <CL/opencl.h> +#endif + +namespace clblast { +// ================================================================================================= + +// Base class for any object +class Object { + protected: + + // Error handling (NOTE: these functions are [[noreturn]]) + void Error(const std::string &message) const { + throw std::runtime_error("Internal OpenCL error: "+message); + } + void Error(const cl_int status) const { + throw std::runtime_error("Internal OpenCL error with status: "+std::to_string(status)); + } +}; + +// ================================================================================================= + +// Base class for objects which require memory management +class ObjectWithState: public Object { + +}; + +// ================================================================================================= + +// C++11 version of cl_event +class Event: public Object { + public: + + // Constructor based on the plain C data-type + explicit Event(const cl_event event): event_(event) { } + + // New event + Event(): event_() {} + + // Public functions + size_t GetProfilingStart() const { + auto bytes = size_t{0}; + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); + auto result = size_t{0}; + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &result, nullptr); + return result; + } + size_t GetProfilingEnd() const { + auto bytes = size_t{0}; + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); + auto result = size_t{0}; + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &result, nullptr); + return result; + } + cl_int Wait() const { + return clWaitForEvents(1, &event_); + } + + // Accessors to the private data-member + cl_event operator()() const { return event_; } + cl_event& operator()() { return event_; } + private: + cl_event event_; +}; + +// ================================================================================================= + +// C++11 version of cl_platform_id +class Platform: public Object { + public: + + // Constructor based on the plain C data-type + explicit Platform(const cl_platform_id platform): platform_(platform) { } + + // Initialize the platform. Note that this constructor can throw exceptions! + explicit Platform(const size_t platform_id) { + auto num_platforms = cl_uint{0}; + auto status = clGetPlatformIDs(0, nullptr, &num_platforms); + if (status != CL_SUCCESS) { Error(status); } + if (num_platforms == 0) { Error("no platforms found"); } + auto platforms = std::vector<cl_platform_id>(num_platforms); + status = clGetPlatformIDs(num_platforms, platforms.data(), nullptr); + if (status != CL_SUCCESS) { Error(status); } + if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); } + platform_ = platforms[platform_id]; + } + + // Accessors to the private data-member + cl_platform_id operator()() const { return platform_; } + cl_platform_id& operator()() { return platform_; } + private: + cl_platform_id platform_; +}; + +// ================================================================================================= + +// C++11 version of cl_device_id +class Device: public Object { + public: + + // Constructor based on the plain C data-type + explicit Device(const cl_device_id device): device_(device) { } + + // Initialize the device. Note that this constructor can throw exceptions! + explicit Device(const Platform &platform, const cl_device_type type, const size_t device_id) { + auto num_devices = cl_uint{0}; + auto status = clGetDeviceIDs(platform(), type, 0, nullptr, &num_devices); + if (status != CL_SUCCESS) { Error(status); } + if (num_devices == 0) { Error("no devices found"); } + auto devices = std::vector<cl_device_id>(num_devices); + status = clGetDeviceIDs(platform(), type, num_devices, devices.data(), nullptr); + if (status != CL_SUCCESS) { Error(status); } + if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); } + device_ = devices[device_id]; + } + + // Public functions + std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); } + cl_device_type Type() const { return GetInfo<cl_device_type>(CL_DEVICE_TYPE); } + std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); } + std::string Name() const { return GetInfoString(CL_DEVICE_NAME); } + std::string Extensions() const { return GetInfoString(CL_DEVICE_EXTENSIONS); } + size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); } + cl_ulong LocalMemSize() const { return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE); } + cl_uint MaxWorkItemDimensions() const { + return GetInfo<cl_uint>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS); + } + std::vector<size_t> MaxWorkItemSizes() const { + return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES); + } + + // Configuration-validity checks + bool IsLocalMemoryValid(const size_t local_mem_usage) const { + return (local_mem_usage <= LocalMemSize()); + } + bool IsThreadConfigValid(const std::vector<size_t> &local) const { + auto local_size = size_t{1}; + for (auto &item: local) { local_size *= item; } + for (auto i=size_t{0}; i<local.size(); ++i) { + if (local[i] > MaxWorkItemSizes()[i]) { return false; } + } + if (local_size > MaxWorkGroupSize()) { return false; } + if (local.size() > MaxWorkItemDimensions()) { return false; } + return true; + } + + // Accessors to the private data-member + cl_device_id operator()() const { return device_; } + cl_device_id& operator()() { return device_; } + private: + + // Helper functions + template <typename T> + T GetInfo(const cl_device_info info) const { + auto bytes = size_t{0}; + clGetDeviceInfo(device_, info, 0, nullptr, &bytes); + auto result = T(0); + clGetDeviceInfo(device_, info, bytes, &result, nullptr); + return result; + } + template <typename T> + std::vector<T> GetInfoVector(const cl_device_info info) const { + auto bytes = size_t{0}; + clGetDeviceInfo(device_, info, 0, nullptr, &bytes); + auto result = std::vector<T>(bytes/sizeof(T)); + clGetDeviceInfo(device_, info, bytes, result.data(), nullptr); + return result; + } + std::string GetInfoString(const cl_device_info info) const { + auto bytes = size_t{0}; + clGetDeviceInfo(device_, info, 0, nullptr, &bytes); + auto result = std::vector<char>(bytes); + clGetDeviceInfo(device_, info, bytes, result.data(), nullptr); + return std::string(result.data()); + } + + cl_device_id device_; +}; + +// ================================================================================================= + +// C++11 version of cl_context +class Context: public ObjectWithState { + public: + + // Constructor based on the plain C data-type + explicit Context(const cl_context context): context_(context) { + clRetainContext(context_); + } + + // Memory management + explicit Context(const Device &device) { + auto status = CL_SUCCESS; + const cl_device_id dev = device(); + context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status); + if (status != CL_SUCCESS) { Error(status); } + } + ~Context() { + clReleaseContext(context_); + } + Context(const Context &other): + context_(other.context_) { + clRetainContext(context_); + } + Context& operator=(Context other) { + swap(*this, other); + return *this; + } + friend void swap(Context &first, Context &second) { + std::swap(first.context_, second.context_); + } + + // Accessors to the private data-member + cl_context operator()() const { return context_; } + cl_context& operator()() { return context_; } + private: + cl_context context_; +}; + +// ================================================================================================= + +// C++11 version of cl_program. Additionally holds the program's source code. +class Program: public ObjectWithState { + public: + + // Note that there is no constructor based on the plain C data-type because of extra state + + // Memory management + explicit Program(const Context &context, const std::string &source): + length_(source.length()) { + std::copy(source.begin(), source.end(), back_inserter(source_)); + source_ptr_ = source_.data(); + auto status = CL_SUCCESS; + program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status); + if (status != CL_SUCCESS) { Error(status); } + } + ~Program() { + clReleaseProgram(program_); + } + Program(const Program &other): + length_(other.length_), + source_(other.source_), + source_ptr_(other.source_ptr_), + program_(other.program_) { + clRetainProgram(program_); + } + Program& operator=(Program other) { + swap(*this, other); + return *this; + } + /* + TODO: Implement move construction/assignment? + Program(Program &&other) { + clRetainProgram(program_); + swap(*this, other); + } + Program& operator=(Program &&other) { + swap(*this, other); + return *this; + }*/ + friend void swap(Program &first, Program &second) { + std::swap(first.length_, second.length_); + std::swap(first.source_, second.source_); + std::swap(first.source_ptr_, second.source_ptr_); + std::swap(first.program_, second.program_); + } + + // Public functions + cl_int Build(const Device &device, const std::string &options) { + const cl_device_id dev = device(); + return clBuildProgram(program_, 1, &dev, options.c_str(), nullptr, nullptr); + } + std::string GetBuildInfo(const Device &device) const { + auto bytes = size_t{0}; + clGetProgramBuildInfo(program_, device(), CL_PROGRAM_BUILD_LOG, 0, nullptr, &bytes); + auto result = std::vector<char>(bytes); + clGetProgramBuildInfo(program_, device(), CL_PROGRAM_BUILD_LOG, bytes, result.data(), nullptr); + return std::string(result.data()); + } + + // Accessors to the private data-member + cl_program operator()() const { return program_; } + cl_program& operator()() { return program_; } + private: + size_t length_; + std::vector<char> source_; + const char* source_ptr_; + cl_program program_; +}; + +// ================================================================================================= + +// C++11 version of cl_kernel +class Kernel: public ObjectWithState { + public: + + // Constructor based on the plain C data-type + explicit Kernel(const cl_kernel kernel): kernel_(kernel) { + clRetainKernel(kernel_); + } + + // Memory management + explicit Kernel(const Program &program, const std::string &name) { + auto status = CL_SUCCESS; + kernel_ = clCreateKernel(program(), name.c_str(), &status); + if (status != CL_SUCCESS) { Error(status); } + } + ~Kernel() { + clReleaseKernel(kernel_); + } + Kernel(const Kernel &other): + kernel_(other.kernel_) { + clRetainKernel(kernel_); + } + Kernel& operator=(Kernel other) { + swap(*this, other); + return *this; + } + friend void swap(Kernel &first, Kernel &second) { + std::swap(first.kernel_, second.kernel_); + } + + // Public functions + template <typename T> // Note: doesn't work with T=Buffer + cl_int SetArgument(const cl_uint index, const T &value) { + return clSetKernelArg(kernel_, index, sizeof(T), &value); + } + size_t LocalMemUsage(const Device &device) const { + auto bytes = size_t{0}; + clGetKernelWorkGroupInfo(kernel_, device(), CL_KERNEL_LOCAL_MEM_SIZE, 0, nullptr, &bytes); + auto result = size_t{0}; + clGetKernelWorkGroupInfo(kernel_, device(), CL_KERNEL_LOCAL_MEM_SIZE, bytes, &result, nullptr); + return result; + } + + // Accessors to the private data-member + cl_kernel operator()() const { return kernel_; } + cl_kernel& operator()() { return kernel_; } + private: + cl_kernel kernel_; +}; + +// ================================================================================================= + +// C++11 version of cl_command_queue +class CommandQueue: public ObjectWithState { + public: + + // Constructor based on the plain C data-type + explicit CommandQueue(const cl_command_queue queue): queue_(queue) { + clRetainCommandQueue(queue_); + } + + // Memory management + explicit CommandQueue(const Context &context, const Device &device) { + auto status = CL_SUCCESS; + queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); + if (status != CL_SUCCESS) { Error(status); } + } + ~CommandQueue() { + clReleaseCommandQueue(queue_); + } + CommandQueue(const CommandQueue &other): + queue_(other.queue_) { + clRetainCommandQueue(queue_); + } + CommandQueue& operator=(CommandQueue other) { + swap(*this, other); + return *this; + } + friend void swap(CommandQueue &first, CommandQueue &second) { + std::swap(first.queue_, second.queue_); + } + + // Public functions + cl_int EnqueueKernel(const Kernel &kernel, const std::vector<size_t> &global, + const std::vector<size_t> &local, Event &event) { + return clEnqueueNDRangeKernel(queue_, kernel(), static_cast<cl_uint>(global.size()), nullptr, + global.data(), local.data(), 0, nullptr, &(event())); + } + Context GetContext() const { + auto bytes = size_t{0}; + clGetCommandQueueInfo(queue_, CL_QUEUE_CONTEXT, 0, nullptr, &bytes); + cl_context result; + clGetCommandQueueInfo(queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr); + return Context(result); + } + Device GetDevice() const { + auto bytes = size_t{0}; + clGetCommandQueueInfo(queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes); + cl_device_id result; + clGetCommandQueueInfo(queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr); + return Device(result); + } + cl_int Finish() { + return clFinish(queue_); + } + + // Accessors to the private data-member + cl_command_queue operator()() const { return queue_; } + cl_command_queue& operator()() { return queue_; } + private: + cl_command_queue queue_; +}; + +// ================================================================================================= + +// C++11 version of cl_mem +class Buffer: public ObjectWithState { + public: + + // Constructor based on the plain C data-type + explicit Buffer(const cl_mem buffer): buffer_(buffer) { + clRetainMemObject(buffer_); + } + + // Memory management + explicit Buffer(const Context &context, const cl_mem_flags flags, const size_t bytes) { + auto status = CL_SUCCESS; + buffer_ = clCreateBuffer(context(), flags, bytes, nullptr, &status); + if (status != CL_SUCCESS) { Error(status); } + } + ~Buffer() { + clReleaseMemObject(buffer_); + } + Buffer(const Buffer &other): + buffer_(other.buffer_) { + clRetainMemObject(buffer_); + } + Buffer& operator=(Buffer other) { + swap(*this, other); + return *this; + } + friend void swap(Buffer &first, Buffer &second) { + std::swap(first.buffer_, second.buffer_); + } + + // Public functions + template <typename T> + cl_int ReadBuffer(const CommandQueue &queue, const size_t bytes, T* host) { + return clEnqueueReadBuffer(queue(), buffer_, CL_TRUE, 0, bytes, host, 0, nullptr, nullptr); + } + template <typename T> + cl_int ReadBuffer(const CommandQueue &queue, const size_t bytes, std::vector<T> &host) { + return ReadBuffer(queue, bytes, host.data()); + } + template <typename T> + cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, const T* host) { + return clEnqueueWriteBuffer(queue(), buffer_, CL_TRUE, 0, bytes, host, 0, nullptr, nullptr); + } + template <typename T> + cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, const std::vector<T> &host) { + return WriteBuffer(queue, bytes, &host[0]); + } + size_t GetSize() const { + auto bytes = size_t{0}; + auto status = clGetMemObjectInfo(buffer_, CL_MEM_SIZE, 0, nullptr, &bytes); + if (status != CL_SUCCESS) { Error(status); } + auto result = size_t{0}; + status = clGetMemObjectInfo(buffer_, CL_MEM_SIZE, bytes, &result, nullptr); + if (status != CL_SUCCESS) { Error(status); } + return result; + } + + // Accessors to the private data-member + cl_mem operator()() const { return buffer_; } + cl_mem& operator()() { return buffer_; } + private: + cl_mem buffer_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_CLPP11_H_ +#endif diff --git a/include/internal/database.h b/include/internal/database.h new file mode 100644 index 00000000..dbbdd5c0 --- /dev/null +++ b/include/internal/database.h @@ -0,0 +1,90 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Database class, providing a static variable holding the actual database +// information. The class also provides utility functions to search the database and to access a +// found entry by parameter-key. The database itself is filled in the corresponding source-file and +// partially also by the database/xxxxx.h files, in which kernel-specific parameters are found. +// +// ================================================================================================= + +#ifndef CLBLAST_DATABASE_H_ +#define CLBLAST_DATABASE_H_ + +#include <string> +#include <vector> +#include <unordered_map> + +#include "internal/utilities.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +class Database { + public: + + // Type alias for the database parameters + using Parameters = std::unordered_map<std::string,size_t>; + + // Structures for content inside the database + struct DatabaseDevice { + const std::string name; + const Parameters parameters; + }; + struct DatabaseVendor { + const cl_device_type type; + const std::string name; + const std::vector<DatabaseDevice> devices; + }; + struct DatabaseEntry { + const std::string kernel; + const Precision precision; + const std::vector<DatabaseVendor> vendors; + }; + + // The default vendor or device + static constexpr auto kDefault = "Default"; + + // The database consists of separate database entries, stored together in a vector + static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble; + static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; + static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble; + static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble; + static const DatabaseEntry TraSingle, TraDouble, TraComplexSingle, TraComplexDouble; + static const DatabaseEntry PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble; + static const std::vector<DatabaseEntry> database; + + // The constructor + explicit Database(const CommandQueue &queue, const std::vector<std::string> &routines, + const Precision precision); + + // Accessor of values by key + size_t operator[](const std::string key) const { return parameters_.find(key)->second; } + + // Obtain a list of OpenCL pre-processor defines based on the parameters + std::string GetDefines() const; + + private: + Parameters Search(const std::string &this_kernel, const cl_device_type this_type, + const std::string &this_vendor, const std::string &this_device, + const Precision this_precision) const; + + // Tests equality between a database-vendor string and an OpenCL vendor string + bool VendorEqual(const std::string &db_vendor, const std::string &cl_vendor) const; + + // Found parameters suitable for this device/kernel + Parameters parameters_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_DATABASE_H_ +#endif diff --git a/include/internal/database/copy.h b/include/internal/database/copy.h new file mode 100644 index 00000000..b9335fc9 --- /dev/null +++ b/include/internal/database/copy.h @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file populates the database with best-found tuning parameters for the Copy kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::CopySingle = { + "Copy", Precision::kSingle, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } }, + { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_WPT",2}, {"COPY_VW",4} } }, + { "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",4}, {"COPY_VW",4} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + { "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",4} } }, + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::CopyDouble = { + "Copy", Precision::kDouble, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + { "Tesla K20m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } }, + { "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::CopyComplexSingle = { + "Copy", Precision::kComplexSingle, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + { "Tesla K20m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",1} } }, + { "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + { "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::CopyComplexDouble = { + "Copy", Precision::kComplexDouble, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/include/internal/database/pad.h b/include/internal/database/pad.h new file mode 100644 index 00000000..5af75308 --- /dev/null +++ b/include/internal/database/pad.h @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file populates the database with best-found tuning parameters for the Pad kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::PadSingle = { + "Pad", Precision::kSingle, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + { "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::PadDouble = { + "Pad", Precision::kDouble, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::PadComplexSingle = { + "Pad", Precision::kComplexSingle, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::PadComplexDouble = { + "Pad", Precision::kComplexDouble, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/include/internal/database/padtranspose.h b/include/internal/database/padtranspose.h new file mode 100644 index 00000000..f1127d60 --- /dev/null +++ b/include/internal/database/padtranspose.h @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file populates the database with best-found tuning parameters for the PadTranspose kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::PadTraSingle = { + "PadTranspose", Precision::kSingle, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } }, + { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } }, + { "Tesla K40m", { {"PADTRA_TILE",32}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + { "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } }, + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::PadTraDouble = { + "PadTranspose", Precision::kDouble, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, + { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, + { "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::PadTraComplexSingle = { + "PadTranspose", Precision::kComplexSingle, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, + { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, + { "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + { "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } }, + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::PadTraComplexDouble = { + "PadTranspose", Precision::kComplexDouble, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, + { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, + { "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/include/internal/database/transpose.h b/include/internal/database/transpose.h new file mode 100644 index 00000000..0814eb8a --- /dev/null +++ b/include/internal/database/transpose.h @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file populates the database with best-found tuning parameters for the Transpose kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::TraSingle = { + "Transpose", Precision::kSingle, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } }, + { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } }, + { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + { "Iris", { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0} } }, + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::TraDouble = { + "Transpose", Precision::kDouble, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } }, + { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } }, + { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::TraComplexSingle = { + "Transpose", Precision::kComplexSingle, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } }, + { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } }, + { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + { "Iris", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } }, + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::TraComplexDouble = { + "Transpose", Precision::kComplexDouble, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1} } }, + { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } }, + { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",0} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/include/internal/database/xaxpy.h b/include/internal/database/xaxpy.h new file mode 100644 index 00000000..c331945a --- /dev/null +++ b/include/internal/database/xaxpy.h @@ -0,0 +1,129 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file populates the database with best-found tuning parameters for the Xaxpy kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XaxpySingle = { + "Xaxpy", Precision::kSingle, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",2} } }, + { "Tesla K20m", { {"WGS",128}, {"WPT",2}, {"VW",2} } }, + { "Tesla K40m", { {"WGS",128}, {"WPT",1}, {"VW",4} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",2} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + { "Iris", { {"WGS",512}, {"WPT",1}, {"VW",1} } }, + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XaxpyDouble = { + "Xaxpy", Precision::kDouble, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",1} } }, + { "Tesla K20m", { {"WGS",512}, {"WPT",1}, {"VW",2} } }, + { "Tesla K40m", { {"WGS",64}, {"WPT",1}, {"VW",2} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"WGS",256}, {"WPT",1}, {"VW",1} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } }, + } + }, + } +}; +// ================================================================================================= + +const Database::DatabaseEntry Database::XaxpyComplexSingle = { + "Xaxpy", Precision::kComplexSingle, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"WGS",256}, {"WPT",1}, {"VW",1} } }, + { "Tesla K20m", { {"WGS",128}, {"WPT",1}, {"VW",1} } }, + { "Tesla K40m", { {"WGS",128}, {"WPT",2}, {"VW",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + { "Iris", { {"WGS",256}, {"WPT",1}, {"VW",1} } }, + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XaxpyComplexDouble = { + "Xaxpy", Precision::kComplexDouble, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"WGS",128}, {"WPT",2}, {"VW",1} } }, + { "Tesla K20m", { {"WGS",256}, {"WPT",1}, {"VW",1} } }, + { "Tesla K40m", { {"WGS",64}, {"WPT",2}, {"VW",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h new file mode 100644 index 00000000..edf41e12 --- /dev/null +++ b/include/internal/database/xgemm.h @@ -0,0 +1,133 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file populates the database with best-found tuning parameters for the Xgemm kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmSingle = { + "Xgemm", Precision::kSingle, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"MWG",128}, {"NWG",64}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } }, + { "Tesla K20m", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",4}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } }, + { "Tesla K40m", { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } }, + { kDefault, { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",8}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + { "Iris", { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } }, + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmDouble = { + "Xgemm", Precision::kDouble, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } }, + { "Tesla K20m", { {"MWG",64}, {"NWG",128}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",32}, {"KWI",8}, {"VWM",2}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } }, + { "Tesla K40m", { {"MWG",64}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } }, + { kDefault, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmComplexSingle = { + "Xgemm", Precision::kComplexSingle, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } }, + { "Tesla K20m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",8}, {"KWI",8}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } }, + { "Tesla K40m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",0}, {"STRN",1}, {"SA",1}, {"SB",1} } }, + { kDefault, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + { "Iris", { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } }, + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmComplexDouble = { + "Xgemm", Precision::kComplexDouble, { + { // NVIDIA GPUs + CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + { "GeForce GTX 480", { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } }, + { "Tesla K20m", { {"MWG",16}, {"NWG",128}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",8}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } }, + { "Tesla K40m", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",1} } }, + { kDefault, { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } }, + } + }, + { // AMD GPUs + CL_DEVICE_TYPE_GPU, "AMD", { + { "Tahiti", { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, + } + }, + { // Intel GPUs + CL_DEVICE_TYPE_GPU, "Intel", { + } + }, + { // Default + CL_DEVICE_TYPE_ALL, kDefault, { + { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, + } + }, + } +}; +// ================================================================================================= +} // namespace clblast diff --git a/include/internal/routine.h b/include/internal/routine.h new file mode 100644 index 00000000..42357d8d --- /dev/null +++ b/include/internal/routine.h @@ -0,0 +1,132 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements all the basic functionality for the BLAS routines. This class serves as a +// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as +// compiling the OpenCL kernel, connecting to the database, etc. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINE_H_ +#define CLBLAST_ROUTINE_H_ + +#include <string> +#include <vector> + +#include "internal/utilities.h" +#include "internal/database.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +class Routine { + public: + + // Khronos OpenCL extensions + const std::string kKhronosHalfPrecision = "cl_khr_fp16"; + const std::string kKhronosDoublePrecision = "cl_khr_fp64"; + + // New data-type:tThe cache of compiled OpenCL programs, along with some meta-data + struct ProgramCache { + Program program; + std::string device_name; + Precision precision; + std::vector<std::string> routines; + + // Finds out whether the properties match + bool MatchInCache(const std::string &ref_name, const Precision &ref_precision, + const std::vector<std::string> &ref_routines) { + auto ref_size = ref_routines.size(); + if (device_name == ref_name && precision == ref_precision && routines.size() == ref_size) { + auto found_match = true; + for (auto i=size_t{0}; i<ref_size; ++i) { + if (routines[i] != ref_routines[i]) { found_match = false; } + } + return found_match; + } + return false; + } + }; + + // The actual cache, implemented as a vector of the above data-type + static std::vector<ProgramCache> program_cache_; + + // Helper functions which check for errors in the status code + static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); } + + // Base class constructor + explicit Routine(CommandQueue &queue, Event &event, + const std::vector<std::string> &routines, const Precision precision); + + // Set-up phase of the kernel + StatusCode SetUp(const std::string &routine_source); + + protected: + + // Runs a kernel given the global and local thread sizes + StatusCode RunKernel(const Kernel &kernel, std::vector<size_t> &global, + const std::vector<size_t> &local); + + // Tests for valid inputs of matrices A, B, and C + StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer &buffer, + const size_t offset, const size_t ld, const size_t data_size); + StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer &buffer, + const size_t offset, const size_t ld, const size_t data_size); + StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer &buffer, + const size_t offset, const size_t ld, const size_t data_size); + + // Tests for valid inputs of vectors X and Y + StatusCode TestVectorX(const size_t n, const Buffer &buffer, const size_t offset, + const size_t inc, const size_t data_size); + StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset, + const size_t inc, const size_t data_size); + + // Copies/transposes a matrix and padds/unpads it + StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two, + const size_t src_ld, const size_t src_offset, + const Buffer &src, + const size_t dest_one, const size_t dest_two, + const size_t dest_ld, const size_t dest_offset, + const Buffer &dest, + const bool do_transpose, const bool pad, + const Program &program); + + // Queries the cache and retrieve either a matching program or a boolean whether a match exists. + // The first assumes that the program is available in the cache and will throw an exception + // otherwise. + Program GetProgramFromCache() const; + bool ProgramIsInCache() const; + + // Non-static variable for the precision. Note that the same variable (but static) might exist in + // a derived class. + const Precision precision_; + + // The OpenCL objects, accessible only from derived classes + CommandQueue queue_; + Event event_; + const Context context_; + const Device device_; + + // OpenCL device properties + const std::string device_name_; + const cl_uint max_work_item_dimensions_; + const std::vector<size_t> max_work_item_sizes_; + const size_t max_work_group_size_; + + // Connection to the database for all the device-specific parameters + const Database db_; + const std::vector<std::string> routines_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINE_H_ +#endif diff --git a/include/internal/routines/xaxpy.h b/include/internal/routines/xaxpy.h new file mode 100644 index 00000000..e548e553 --- /dev/null +++ b/include/internal/routines/xaxpy.h @@ -0,0 +1,42 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xaxpy routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XAXPY_H_ +#define CLBLAST_ROUTINES_XAXPY_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xaxpy: public Routine { + public: + Xaxpy(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoAxpy(const size_t n, const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XAXPY_H_ +#endif diff --git a/include/internal/routines/xgemm.h b/include/internal/routines/xgemm.h new file mode 100644 index 00000000..7ad4fcfb --- /dev/null +++ b/include/internal/routines/xgemm.h @@ -0,0 +1,46 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xgemm routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XGEMM_H_ +#define CLBLAST_ROUTINES_XGEMM_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xgemm: public Routine { + public: + Xgemm(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XGEMM_H_ +#endif diff --git a/include/internal/routines/xsymm.h b/include/internal/routines/xsymm.h new file mode 100644 index 00000000..c6545164 --- /dev/null +++ b/include/internal/routines/xsymm.h @@ -0,0 +1,60 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xsymm routine. It is based on the generalized matrix multiplication +// routine (Xgemm). The Xsymm class inherits from the templated class Xgemm, allowing it to call the +// "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by +// transforming it into a general matrix, and then calls the regular GEMM code. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYMM_H_ +#define CLBLAST_ROUTINES_XSYMM_H_ + +#include "internal/routines/xgemm.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xsymm: public Xgemm<T> { + public: + + // Uses several variables from the Routine class + using Routine::db_; + using Routine::context_; + + // Uses several helper functions from the Routine class + using Routine::RunKernel; + using Routine::ErrorIn; + using Routine::TestMatrixA; + using Routine::GetProgramFromCache; + + // Uses the regular Xgemm routine + using Xgemm<T>::DoGemm; + + // Constructor + Xsymm(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSYMM_H_ +#endif diff --git a/include/internal/tuning.h b/include/internal/tuning.h new file mode 100644 index 00000000..7768888c --- /dev/null +++ b/include/internal/tuning.h @@ -0,0 +1,53 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the header for the tuner functions. This is only used for the optional +// and stand-alone tuner binaries and not part of the core of CLBlast. The convention used here is +// that X and Y are vectors, while A, B, and C are matrices. +// +// ================================================================================================= + +#ifndef CLBLAST_TUNING_H_ +#define CLBLAST_TUNING_H_ + +#include <vector> +#include <functional> + +#include <cltune.h> + +namespace clblast { +// ================================================================================================= + +// Functions with two or three OpenCL memory buffers +template <typename T> +using Tuner2 = std::function<void(const Arguments<T>&, + const std::vector<T>&, std::vector<T>&, + cltune::Tuner&)>; +template <typename T> +using Tuner3 = std::function<void(const Arguments<T>&, + const std::vector<T>&, const std::vector<T>&, std::vector<T>&, + cltune::Tuner&)>; + +// Tuner for vector-vector input +template <typename T> +void TunerXY(int argc, char* argv[], const Tuner2<T> &tune_function); + +// Tuner for matrix-matrix input +template <typename T> +void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function); + +// Tuner for matrix-matrix-matrix input +template <typename T> +void TunerABC(int argc, char* argv[], const Tuner3<T> &tune_function); + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TUNING_H_ +#endif diff --git a/include/internal/utilities.h b/include/internal/utilities.h new file mode 100644 index 00000000..af04dfdb --- /dev/null +++ b/include/internal/utilities.h @@ -0,0 +1,174 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file provides declarations for the common (test) utility functions such as a command-line +// argument parser. On top of this, it serves as the 'common' header, including the C++ OpenCL +// wrapper. These utilities are not only used for CLBlast, but also included as part of the tuners, +// the performance client and the correctness testers. +// +// ================================================================================================= + +#ifndef CLBLAST_UTILITIES_H_ +#define CLBLAST_UTILITIES_H_ + +#include <string> +#include <functional> +#include <complex> + +#include "clblast.h" +#include "internal/clpp11.h" + +namespace clblast { +// ================================================================================================= + +// Shorthands for complex data-types +using float2 = std::complex<float>; +using double2 = std::complex<double>; + +// ================================================================================================= + +// The routine-specific arguments in string form +constexpr auto kArgM = "m"; +constexpr auto kArgN = "n"; +constexpr auto kArgK = "k"; +constexpr auto kArgLayout = "layout"; +constexpr auto kArgATransp = "transA"; +constexpr auto kArgBTransp = "transB"; +constexpr auto kArgSide = "side"; +constexpr auto kArgTriangle = "triangle"; +constexpr auto kArgXInc = "incx"; +constexpr auto kArgYInc = "incy"; +constexpr auto kArgXOffset = "offx"; +constexpr auto kArgYOffset = "offy"; +constexpr auto kArgALeadDim = "lda"; +constexpr auto kArgBLeadDim = "ldb"; +constexpr auto kArgCLeadDim = "ldc"; +constexpr auto kArgAOffset = "offa"; +constexpr auto kArgBOffset = "offb"; +constexpr auto kArgCOffset = "offc"; +constexpr auto kArgAlpha = "alpha"; +constexpr auto kArgBeta = "beta"; + +// The tuner-specific arguments in string form +constexpr auto kArgFraction = "fraction"; + +// The client-specific arguments in string form +constexpr auto kArgCompareclblas = "clblas"; +constexpr auto kArgStepSize = "step"; +constexpr auto kArgNumSteps = "num_steps"; +constexpr auto kArgNumRuns = "runs"; + +// The common arguments in string form +constexpr auto kArgPlatform = "platform"; +constexpr auto kArgDevice = "device"; +constexpr auto kArgPrecision = "precision"; +constexpr auto kArgHelp = "h"; +constexpr auto kArgQuiet = "q"; +constexpr auto kArgNoAbbreviations = "no_abbrv"; + +// ================================================================================================= + +// Structure containing all possible arguments for test clients, including their default values +template <typename T> +struct Arguments { + // Routine-specific arguments + size_t m = 0; + size_t n = 0; + size_t k = 0; + Layout layout = Layout::kRowMajor; + Transpose a_transpose = Transpose::kNo; + Transpose b_transpose = Transpose::kNo; + Side side = Side::kLeft; + Triangle triangle = Triangle::kUpper; + size_t x_inc = 1; + size_t y_inc = 1; + size_t x_offset = 0; + size_t y_offset = 0; + size_t a_ld = 0; + size_t b_ld = 0; + size_t c_ld = 0; + size_t a_offset = 0; + size_t b_offset = 0; + size_t c_offset = 0; + T alpha = T{1.0}; + T beta = T{1.0}; + // Tuner-specific arguments + double fraction = 1.0; + // Client-specific arguments + bool compare_clblas = 1; + size_t step = 1; + size_t num_steps = 0; + size_t num_runs = 10; + // Common arguments + size_t platform_id = 0; + size_t device_id = 0; + Precision precision = Precision::kSingle; + bool print_help = false; + bool silent = false; + bool no_abbrv = false; +}; + +// ================================================================================================= + +// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast +// data-types such as the Layout and Transpose data-types. +template <typename T> +std::string ToString(T value); + +// ================================================================================================= + +// Helper for the function "GetArgument" +template <typename T> +T ConvertArgument(const char* value); + +// Basic argument parser, matching patterns in the form of "-option value" and "--option value" +template <typename T> +T GetArgument(const int argc, char *argv[], std::string &help, + const std::string &option, const T default_value); + +// Returns the precision only +Precision GetPrecision(const int argc, char *argv[]); + +// As in "GetArgument", but now only checks whether an argument is given or not +bool CheckArgument(const int argc, char *argv[], std::string &help, const std::string &option); + +// ================================================================================================= + +// Returns a random number to be used as a seed +unsigned int GetRandomSeed(); + +// Populates a vector with random data +template <typename T> +void PopulateVector(std::vector<T> &vector); + +// ================================================================================================= + +// Returns a scalar with a default value +template <typename T> +T GetScalar(); + +// ================================================================================================= + +// Rounding functions +size_t CeilDiv(const size_t x, const size_t y); +size_t Ceil(const size_t x, const size_t y); + +// Returns whether or not 'a' is a multiple of 'b' +bool IsMultiple(const size_t a, const size_t b); + +// ================================================================================================= + +// Convert the precision enum into bytes, e.g. a double takes up 8 bytes +size_t GetBytes(const Precision precision); + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_UTILITIES_H_ +#endif |