From f7199b831f847340f0921ef2140a4e64809db037 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Mon, 27 Jul 2015 07:18:06 +0200 Subject: Now using the new Claduc C++11 OpenCL header --- include/internal/clpp11.h | 640 +++++++++++++++++------------- include/internal/database.h | 26 +- include/internal/database/copy.h | 40 +- include/internal/database/pad.h | 40 +- include/internal/database/padtranspose.h | 40 +- include/internal/database/transpose.h | 40 +- include/internal/database/xaxpy.h | 40 +- include/internal/database/xgemm.h | 48 +-- include/internal/database/xgemv.h | 40 +- include/internal/routine.h | 23 +- include/internal/routines/level1/xaxpy.h | 20 +- include/internal/routines/level2/xgemv.h | 23 +- include/internal/routines/level3/xgemm.h | 25 +- include/internal/routines/level3/xhemm.h | 24 +- include/internal/routines/level3/xher2k.h | 25 +- include/internal/routines/level3/xherk.h | 22 +- include/internal/routines/level3/xsymm.h | 24 +- include/internal/routines/level3/xsyr2k.h | 25 +- include/internal/routines/level3/xsyrk.h | 22 +- include/internal/routines/level3/xtrmm.h | 22 +- include/internal/utilities.h | 11 +- src/clblast.cc | 78 ++-- src/database.cc | 18 +- src/routine.cc | 98 +++-- src/routines/level1/xaxpy.cc | 8 +- src/routines/level2/xgemv.cc | 10 +- src/routines/level3/xgemm.cc | 16 +- src/routines/level3/xhemm.cc | 10 +- src/routines/level3/xher2k.cc | 20 +- src/routines/level3/xherk.cc | 14 +- src/routines/level3/xsymm.cc | 10 +- src/routines/level3/xsyr2k.cc | 16 +- src/routines/level3/xsyrk.cc | 12 +- src/routines/level3/xtrmm.cc | 8 +- test/correctness/testblas.cc | 68 ++-- test/correctness/testblas.h | 4 +- test/correctness/tester.cc | 8 +- test/correctness/tester.h | 5 +- test/performance/client.cc | 28 +- test/performance/client.h | 9 +- test/routines/level1/xaxpy.h | 11 +- test/routines/level2/xgemv.h | 11 +- test/routines/level3/xgemm.h | 11 +- test/routines/level3/xhemm.h | 11 +- test/routines/level3/xher2k.h | 11 +- test/routines/level3/xherk.h | 11 +- test/routines/level3/xsymm.h | 11 +- test/routines/level3/xsyr2k.h | 11 +- test/routines/level3/xsyrk.h | 11 +- test/routines/level3/xtrmm.h | 11 +- 50 files changed, 970 insertions(+), 800 deletions(-) diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h index d48b646d..2c2cc797 100644 --- a/include/internal/clpp11.h +++ b/include/internal/clpp11.h @@ -7,18 +7,17 @@ // Author(s): // Cedric Nugteren // -// This file implements a C++11 wrapper around some OpenCL C data-types, similar to Khronos' cl.hpp. -// The main differences are modern C++11 support and a straightforward implemenation of the basic -// needs (as required for this project). It also includes some extra functionality not available -// in cl.hpp, such as including the sources with a Program object and querying a Kernel's validity -// in terms of local memory usage. +// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API +// calls. The main benefits are increased abstraction, automatic memory management, and portability. +// Portability here means that a similar header exists for CUDA with the same classes and +// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. // -// This file is adapted from the C++ bindings from the CLTune project and therefore contains the -// following copyright notice: +// This file is taken from the Claduc project and therefore +// contains the following header copyright notice: // // ================================================================================================= // -// Copyright 2014 SURFsara +// Copyright 2015 SURFsara // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -37,13 +36,15 @@ #ifndef CLBLAST_CLPP11_H_ #define CLBLAST_CLPP11_H_ -#include // std::swap +// C++ #include // std::copy -#include // std::string -#include // std::vector +#include // std::string +#include // std::vector +#include // std::shared_ptr #include // std::runtime_error +#include // std::accumulate -// Includes the normal OpenCL C header +// OpenCL #if defined(__APPLE__) || defined(__MACOSX) #include #else @@ -53,59 +54,46 @@ namespace clblast { // ================================================================================================= -// Base class for any object -class Object { - protected: +// Error occurred in the C++11 OpenCL header (this file) +inline void Error(const std::string &message) { + throw std::runtime_error("Internal OpenCL error: "+message); +} - // Error handling (NOTE: these functions are [[noreturn]]) - void Error(const std::string &message) const { - throw std::runtime_error("Internal OpenCL error: "+message); +// Error occurred in OpenCL +inline void CheckError(const cl_int status) { + if (status != CL_SUCCESS) { + throw std::runtime_error("Internal OpenCL error: "+std::to_string(status)); } - void Error(const cl_int status) const { - throw std::runtime_error("Internal OpenCL error with status: "+std::to_string(status)); - } -}; - -// ================================================================================================= - -// Base class for objects which require memory management -class ObjectWithState: public Object { - -}; +} // ================================================================================================= -// C++11 version of cl_event -class Event: public Object { +// C++11 version of 'cl_event' +class Event { public: - // Constructor based on the plain C data-type + // Constructor based on the regular OpenCL data-type explicit Event(const cl_event event): event_(event) { } - // New event - Event(): event_() {} + // Regular constructor + explicit Event() { } - // Public functions - size_t GetProfilingStart() const { + // Retrieves the elapsed time of the last recorded event. Note that no error checking is done on + // the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation: + // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx + float GetElapsedTime() const { + CheckError(clWaitForEvents(1, &event_)); auto bytes = size_t{0}; clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); - auto result = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &result, nullptr); - return result; - } - size_t GetProfilingEnd() const { - auto bytes = size_t{0}; + auto time_start = size_t{0}; + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); - auto result = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &result, nullptr); - return result; - } - cl_int Wait() const { - return clWaitForEvents(1, &event_); + auto time_end = size_t{0}; + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); + return (time_end - time_start) * 1.0e-6f; } - // Accessors to the private data-member - cl_event operator()() const { return event_; } + // Accessor to the private data-member cl_event& operator()() { return event_; } private: cl_event event_; @@ -113,27 +101,25 @@ class Event: public Object { // ================================================================================================= -// C++11 version of cl_platform_id -class Platform: public Object { +// C++11 version of 'cl_platform_id' +class Platform { public: - // Constructor based on the plain C data-type + // Constructor based on the regular OpenCL data-type explicit Platform(const cl_platform_id platform): platform_(platform) { } - // Initialize the platform. Note that this constructor can throw exceptions! + // Initializes the platform explicit Platform(const size_t platform_id) { auto num_platforms = cl_uint{0}; - auto status = clGetPlatformIDs(0, nullptr, &num_platforms); - if (status != CL_SUCCESS) { Error(status); } + CheckError(clGetPlatformIDs(0, nullptr, &num_platforms)); if (num_platforms == 0) { Error("no platforms found"); } auto platforms = std::vector(num_platforms); - status = clGetPlatformIDs(num_platforms, platforms.data(), nullptr); - if (status != CL_SUCCESS) { Error(status); } + CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr)); if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); } platform_ = platforms[platform_id]; } - // Accessors to the private data-member + // Accessor to the private data-member const cl_platform_id& operator()() const { return platform_; } private: cl_platform_id platform_; @@ -141,40 +127,53 @@ class Platform: public Object { // ================================================================================================= -// C++11 version of cl_device_id -class Device: public Object { +// C++11 version of 'cl_device_id' +class Device { public: - // Constructor based on the plain C data-type + // Constructor based on the regular OpenCL data-type explicit Device(const cl_device_id device): device_(device) { } // Initialize the device. Note that this constructor can throw exceptions! - explicit Device(const Platform &platform, const cl_device_type type, const size_t device_id) { + explicit Device(const Platform &platform, const size_t device_id) { auto num_devices = cl_uint{0}; - auto status = clGetDeviceIDs(platform(), type, 0, nullptr, &num_devices); - if (status != CL_SUCCESS) { Error(status); } + CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, 0, nullptr, &num_devices)); if (num_devices == 0) { Error("no devices found"); } auto devices = std::vector(num_devices); - status = clGetDeviceIDs(platform(), type, num_devices, devices.data(), nullptr); - if (status != CL_SUCCESS) { Error(status); } + CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, num_devices, devices.data(), nullptr)); if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); } device_ = devices[device_id]; } - // Public functions - std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); } - cl_device_type Type() const { return GetInfo(CL_DEVICE_TYPE); } - std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); } - std::string Name() const { return GetInfoString(CL_DEVICE_NAME); } - std::string Extensions() const { return GetInfoString(CL_DEVICE_EXTENSIONS); } + // Methods to retrieve device information + std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); } + std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); } + std::string Name() const { return GetInfoString(CL_DEVICE_NAME); } + std::string Type() const { + auto type = GetInfo(CL_DEVICE_TYPE); + switch(type) { + case CL_DEVICE_TYPE_CPU: return "CPU"; + case CL_DEVICE_TYPE_GPU: return "GPU"; + case CL_DEVICE_TYPE_ACCELERATOR: return "accelerator"; + default: return "default"; + } + } size_t MaxWorkGroupSize() const { return GetInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE); } - cl_ulong LocalMemSize() const { return GetInfo(CL_DEVICE_LOCAL_MEM_SIZE); } - cl_uint MaxWorkItemDimensions() const { - return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS); + size_t MaxWorkItemDimensions() const { + return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS); } std::vector MaxWorkItemSizes() const { return GetInfoVector(CL_DEVICE_MAX_WORK_ITEM_SIZES); } + size_t LocalMemSize() const { + return static_cast(GetInfo(CL_DEVICE_LOCAL_MEM_SIZE)); + } + std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); } + size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); } + size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); } + size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); } + size_t MemoryClock() const { return 0; } // Not exposed in OpenCL + size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL // Configuration-validity checks bool IsLocalMemoryValid(const size_t local_mem_usage) const { @@ -182,7 +181,7 @@ class Device: public Object { } bool IsThreadConfigValid(const std::vector &local) const { auto local_size = size_t{1}; - for (auto &item: local) { local_size *= item; } + for (const auto &item: local) { local_size *= item; } for (auto i=size_t{0}; i MaxWorkItemSizes()[i]) { return false; } } @@ -191,313 +190,404 @@ class Device: public Object { return true; } - // Accessors to the private data-member + // Accessor to the private data-member const cl_device_id& operator()() const { return device_; } private: + cl_device_id device_; - // Helper functions + // Private helper functions template T GetInfo(const cl_device_info info) const { auto bytes = size_t{0}; - clGetDeviceInfo(device_, info, 0, nullptr, &bytes); + CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); auto result = T(0); - clGetDeviceInfo(device_, info, bytes, &result, nullptr); + CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); return result; } + size_t GetInfo(const cl_device_info info) const { + auto bytes = size_t{0}; + CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); + auto result = cl_uint(0); + CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); + return static_cast(result); + } template std::vector GetInfoVector(const cl_device_info info) const { auto bytes = size_t{0}; - clGetDeviceInfo(device_, info, 0, nullptr, &bytes); + CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); auto result = std::vector(bytes/sizeof(T)); - clGetDeviceInfo(device_, info, bytes, result.data(), nullptr); + CheckError(clGetDeviceInfo(device_, info, bytes, result.data(), nullptr)); return result; } std::string GetInfoString(const cl_device_info info) const { auto bytes = size_t{0}; - clGetDeviceInfo(device_, info, 0, nullptr, &bytes); - auto result = std::vector(bytes); - clGetDeviceInfo(device_, info, bytes, result.data(), nullptr); - return std::string(result.data()); + CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr)); + return std::string{result.c_str()}; } - - cl_device_id device_; }; // ================================================================================================= -// C++11 version of cl_context -class Context: public ObjectWithState { +// C++11 version of 'cl_context' +class Context { public: - // Constructor based on the plain C data-type - explicit Context(const cl_context context): context_(context) { - clRetainContext(context_); + // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere + explicit Context(const cl_context context): + context_(new cl_context) { + *context_ = context; } - // Memory management - explicit Context(const Device &device) { + // Regular constructor with memory management + explicit Context(const Device &device): + context_(new cl_context, [](cl_context* c) { CheckError(clReleaseContext(*c)); delete c; }) { auto status = CL_SUCCESS; const cl_device_id dev = device(); - context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status); - if (status != CL_SUCCESS) { Error(status); } - } - ~Context() { - clReleaseContext(context_); - } - Context(const Context &other): - context_(other.context_) { - clRetainContext(context_); - } - Context& operator=(Context other) { - swap(*this, other); - return *this; - } - friend void swap(Context &first, Context &second) { - std::swap(first.context_, second.context_); + *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status); + CheckError(status); } - // Accessors to the private data-member - const cl_context& operator()() const { return context_; } + // Accessor to the private data-member + const cl_context& operator()() const { return *context_; } private: - cl_context context_; + std::shared_ptr context_; }; // ================================================================================================= -// C++11 version of cl_program. Additionally holds the program's source code. -class Program: public ObjectWithState { - public: - - // Note that there is no constructor based on the plain C data-type because of extra state +// Enumeration of build statuses of the run-time compilation process +enum class BuildStatus { kSuccess, kError, kInvalid }; - // Memory management - explicit Program(const Context &context, const std::string &source): - length_(source.length()) { - std::copy(source.begin(), source.end(), back_inserter(source_)); - source_ptr_ = source_.data(); - auto status = CL_SUCCESS; - program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status); - if (status != CL_SUCCESS) { Error(status); } - } - ~Program() { - clReleaseProgram(program_); - } - Program(const Program &other): - length_(other.length_), - source_(other.source_), - source_ptr_(other.source_ptr_), - program_(other.program_) { - clRetainProgram(program_); - } - Program& operator=(Program other) { - swap(*this, other); - return *this; - } - friend void swap(Program &first, Program &second) { - std::swap(first.length_, second.length_); - std::swap(first.source_, second.source_); - std::swap(first.source_ptr_, second.source_ptr_); - std::swap(first.program_, second.program_); +// C++11 version of 'cl_program'. Additionally holds the program's source code. +class Program { + public: + // Note that there is no constructor based on the regular OpenCL data-type because of extra state + + // Regular constructor with memory management + explicit Program(const Context &context, std::string source): + program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), + length_(source.length()), + source_(std::move(source)), + source_ptr_(&source_[0]) { + auto status = CL_SUCCESS; + *program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status); + CheckError(status); } - // Public functions - cl_int Build(const Device &device, const std::string &options) { + // Compiles the device program and returns whether or not there where any warnings/errors + BuildStatus Build(const Device &device, std::vector &options) { + auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "}); const cl_device_id dev = device(); - return clBuildProgram(program_, 1, &dev, options.c_str(), nullptr, nullptr); + auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr); + if (status == CL_BUILD_PROGRAM_FAILURE) { + return BuildStatus::kError; + } + else if (status == CL_INVALID_BINARY) { + return BuildStatus::kInvalid; + } + else { + CheckError(status); + return BuildStatus::kSuccess; + } } + + // Retrieves the warning/error message from the compiler (if any) std::string GetBuildInfo(const Device &device) const { auto bytes = size_t{0}; - clGetProgramBuildInfo(program_, device(), CL_PROGRAM_BUILD_LOG, 0, nullptr, &bytes); - auto result = std::vector(bytes); - clGetProgramBuildInfo(program_, device(), CL_PROGRAM_BUILD_LOG, bytes, result.data(), nullptr); - return std::string(result.data()); + auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG}; + CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr)); + return result; + } + + // Retrieves an intermediate representation of the compiled program + std::string GetIR() const { + auto bytes = size_t{0}; + CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr)); + auto result = std::string{}; + result.resize(bytes); + auto result_ptr = result.data(); + CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr)); + return result; } - // Accessors to the private data-member - const cl_program& operator()() const { return program_; } + // Accessor to the private data-member + const cl_program& operator()() const { return *program_; } private: + std::shared_ptr program_; size_t length_; - std::vector source_; + std::string source_; const char* source_ptr_; - cl_program program_; }; // ================================================================================================= -// C++11 version of cl_kernel -class Kernel: public ObjectWithState { +// C++11 version of 'cl_command_queue' +class Queue { public: - // Constructor based on the plain C data-type - explicit Kernel(const cl_kernel kernel): kernel_(kernel) { - clRetainKernel(kernel_); + // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere + explicit Queue(const cl_command_queue queue): + queue_(new cl_command_queue) { + *queue_ = queue; } - // Memory management - explicit Kernel(const Program &program, const std::string &name) { + // Regular constructor with memory management + explicit Queue(const Context &context, const Device &device): + queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s)); + delete s; }) { auto status = CL_SUCCESS; - kernel_ = clCreateKernel(program(), name.c_str(), &status); - if (status != CL_SUCCESS) { Error(status); } + *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); + CheckError(status); } - ~Kernel() { - clReleaseKernel(kernel_); + + // Synchronizes the queue + void Finish(Event &) const { + Finish(); } - Kernel(const Kernel &other): - kernel_(other.kernel_) { - clRetainKernel(kernel_); + void Finish() const { + CheckError(clFinish(*queue_)); } - Kernel& operator=(Kernel other) { - swap(*this, other); - return *this; + + // Retrieves the corresponding context or device + Context GetContext() const { + auto bytes = size_t{0}; + CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, 0, nullptr, &bytes)); + cl_context result; + CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr)); + return Context(result); } - friend void swap(Kernel &first, Kernel &second) { - std::swap(first.kernel_, second.kernel_); + Device GetDevice() const { + auto bytes = size_t{0}; + CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes)); + cl_device_id result; + CheckError(clGetCommandQueueInfo(*queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr)); + return Device(result); } - // Public functions - template // Note: doesn't work with T=Buffer - cl_int SetArgument(const cl_uint index, const T &value) { - return clSetKernelArg(kernel_, index, sizeof(T), &value); + // Accessor to the private data-member + const cl_command_queue& operator()() const { return *queue_; } + private: + std::shared_ptr queue_; +}; + +// ================================================================================================= + +// C++11 version of host memory +template +class BufferHost { + public: + + // Regular constructor with memory management + explicit BufferHost(const Context &, const size_t size): + buffer_(new std::vector(size)) { } - size_t LocalMemUsage(const Device &device) const { - auto bytes = size_t{0}; - clGetKernelWorkGroupInfo(kernel_, device(), CL_KERNEL_LOCAL_MEM_SIZE, 0, nullptr, &bytes); - auto result = size_t{0}; - clGetKernelWorkGroupInfo(kernel_, device(), CL_KERNEL_LOCAL_MEM_SIZE, bytes, &result, nullptr); - return result; + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + return buffer_->size()*sizeof(T); } - // Accessors to the private data-member - const cl_kernel& operator()() const { return kernel_; } + // Compatibility with std::vector + size_t size() const { return buffer_->size(); } + T* begin() { return &(*buffer_)[0]; } + T* end() { return &(*buffer_)[buffer_->size()-1]; } + T& operator[](const size_t i) { return (*buffer_)[i]; } + T* data() { return buffer_->data(); } + const T* data() const { return buffer_->data(); } + private: - cl_kernel kernel_; + std::shared_ptr> buffer_; }; // ================================================================================================= -// C++11 version of cl_command_queue -class CommandQueue: public ObjectWithState { +// Enumeration of buffer access types +enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite }; + +// C++11 version of 'cl_mem' +template +class Buffer { public: - // Constructor based on the plain C data-type - explicit CommandQueue(const cl_command_queue queue): queue_(queue) { - clRetainCommandQueue(queue_); + // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere + explicit Buffer(const cl_mem buffer): + buffer_(new cl_mem), + access_(BufferAccess::kReadWrite) { + *buffer_ = buffer; } - // Memory management - explicit CommandQueue(const Context &context, const Device &device) { + // Regular constructor with memory management + explicit Buffer(const Context &context, const BufferAccess access, const size_t size): + buffer_(new cl_mem, [](cl_mem* m) { CheckError(clReleaseMemObject(*m)); delete m; }), + access_(access) { + auto flags = cl_mem_flags{CL_MEM_READ_WRITE}; + if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; } + if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; } auto status = CL_SUCCESS; - queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); - if (status != CL_SUCCESS) { Error(status); } + *buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status); + CheckError(status); } - ~CommandQueue() { - clReleaseCommandQueue(queue_); + + // As above, but now with read/write access as a default + explicit Buffer(const Context &context, const size_t size): + Buffer(context, BufferAccess::kReadWrite, size) { } - CommandQueue(const CommandQueue &other): - queue_(other.queue_) { - clRetainCommandQueue(queue_); + + // Copies from device to host: reading the device buffer a-synchronously + void ReadAsync(const Queue &queue, const size_t size, T* host) { + if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); } + CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), host, 0, + nullptr, nullptr)); } - CommandQueue& operator=(CommandQueue other) { - swap(*this, other); - return *this; + void ReadAsync(const Queue &queue, const size_t size, std::vector &host) { + if (host.size() < size) { Error("target host buffer is too small"); } + ReadAsync(queue, size, host.data()); } - friend void swap(CommandQueue &first, CommandQueue &second) { - std::swap(first.queue_, second.queue_); + void ReadAsync(const Queue &queue, const size_t size, BufferHost &host) { + if (host.size() < size) { Error("target host buffer is too small"); } + ReadAsync(queue, size, host.data()); } - // Public functions - cl_int EnqueueKernel(const Kernel &kernel, const std::vector &global, - const std::vector &local, Event &event) { - return clEnqueueNDRangeKernel(queue_, kernel(), static_cast(global.size()), nullptr, - global.data(), local.data(), 0, nullptr, &(event())); + // Copies from device to host: reading the device buffer + void Read(const Queue &queue, const size_t size, T* host) { + ReadAsync(queue, size, host); + queue.Finish(); } - Context GetContext() const { - auto bytes = size_t{0}; - clGetCommandQueueInfo(queue_, CL_QUEUE_CONTEXT, 0, nullptr, &bytes); - cl_context result; - clGetCommandQueueInfo(queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr); - return Context(result); + void Read(const Queue &queue, const size_t size, std::vector &host) { + Read(queue, size, host.data()); } - Device GetDevice() const { - auto bytes = size_t{0}; - clGetCommandQueueInfo(queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes); - cl_device_id result; - clGetCommandQueueInfo(queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr); - return Device(result); + void Read(const Queue &queue, const size_t size, BufferHost &host) { + Read(queue, size, host.data()); + } + + // Copies from host to device: writing the device buffer a-synchronously + void WriteAsync(const Queue &queue, const size_t size, const T* host) { + if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); } + if (GetSize() < size*sizeof(T)) { Error("target device buffer is too small"); } + CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), host, 0, + nullptr, nullptr)); + } + void WriteAsync(const Queue &queue, const size_t size, const std::vector &host) { + WriteAsync(queue, size, host.data()); } - cl_int Finish() { - return clFinish(queue_); + void WriteAsync(const Queue &queue, const size_t size, const BufferHost &host) { + WriteAsync(queue, size, host.data()); } - // Accessors to the private data-member - const cl_command_queue& operator()() const { return queue_; } + // Copies from host to device: writing the device buffer + void Write(const Queue &queue, const size_t size, const T* host) { + WriteAsync(queue, size, host); + queue.Finish(); + } + void Write(const Queue &queue, const size_t size, const std::vector &host) { + Write(queue, size, host.data()); + } + void Write(const Queue &queue, const size_t size, const BufferHost &host) { + Write(queue, size, host.data()); + } + + // Copies the contents of this buffer into another device buffer + void CopyToAsync(const Queue &queue, const size_t size, const Buffer &destination) { + CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0, + nullptr, nullptr)); + } + void CopyTo(const Queue &queue, const size_t size, const Buffer &destination) { + CopyToAsync(queue, size, destination); + queue.Finish(); + } + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + auto bytes = size_t{0}; + CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes)); + auto result = size_t{0}; + CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr)); + return result; + } + + // Accessor to the private data-member + const cl_mem& operator()() const { return *buffer_; } private: - cl_command_queue queue_; + std::shared_ptr buffer_; + const BufferAccess access_; }; // ================================================================================================= -// C++11 version of cl_mem -class Buffer: public ObjectWithState { +// C++11 version of 'cl_kernel' +class Kernel { public: - // Constructor based on the plain C data-type - explicit Buffer(const cl_mem buffer): buffer_(buffer) { - clRetainMemObject(buffer_); + // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere + explicit Kernel(const cl_kernel kernel): + kernel_(new cl_kernel) { + *kernel_ = kernel; } - // Memory management - explicit Buffer(const Context &context, const cl_mem_flags flags, const size_t bytes) { + // Regular constructor with memory management + explicit Kernel(const Program &program, const std::string &name): + kernel_(new cl_kernel, [](cl_kernel* k) { CheckError(clReleaseKernel(*k)); delete k; }) { auto status = CL_SUCCESS; - buffer_ = clCreateBuffer(context(), flags, bytes, nullptr, &status); - if (status != CL_SUCCESS) { Error(status); } - } - ~Buffer() { - clReleaseMemObject(buffer_); - } - Buffer(const Buffer &other): - buffer_(other.buffer_) { - clRetainMemObject(buffer_); - } - Buffer& operator=(Buffer other) { - swap(*this, other); - return *this; - } - friend void swap(Buffer &first, Buffer &second) { - std::swap(first.buffer_, second.buffer_); + *kernel_ = clCreateKernel(program(), name.c_str(), &status); + CheckError(status); } - // Public functions + // Sets a kernel argument at the indicated position template - cl_int ReadBuffer(const CommandQueue &queue, const size_t bytes, T* host) { - return clEnqueueReadBuffer(queue(), buffer_, CL_TRUE, 0, bytes, host, 0, nullptr, nullptr); + void SetArgument(const size_t index, const T &value) { + CheckError(clSetKernelArg(*kernel_, static_cast(index), sizeof(T), &value)); } template - cl_int ReadBuffer(const CommandQueue &queue, const size_t bytes, std::vector &host) { - return ReadBuffer(queue, bytes, host.data()); + void SetArgument(const size_t index, Buffer &value) { + SetArgument(index, value()); } - template - cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, const T* host) { - return clEnqueueWriteBuffer(queue(), buffer_, CL_TRUE, 0, bytes, host, 0, nullptr, nullptr); - } - template - cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, const std::vector &host) { - return WriteBuffer(queue, bytes, &host[0]); + + // Sets all arguments in one go using parameter packs. Note that this overwrites previously set + // arguments using 'SetArgument' or 'SetArguments'. + template + void SetArguments(Args&... args) { + SetArgumentsRecursive(0, args...); } - size_t GetSize() const { + + // Retrieves the amount of local memory used per work-group for this kernel + size_t LocalMemUsage(const Device &device) const { auto bytes = size_t{0}; - auto status = clGetMemObjectInfo(buffer_, CL_MEM_SIZE, 0, nullptr, &bytes); - if (status != CL_SUCCESS) { Error(status); } + auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE}; + CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes)); auto result = size_t{0}; - status = clGetMemObjectInfo(buffer_, CL_MEM_SIZE, bytes, &result, nullptr); - if (status != CL_SUCCESS) { Error(status); } + CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr)); return result; } - // Accessors to the private data-member - const cl_mem& operator()() const { return buffer_; } + // Launches a kernel onto the specified queue + void Launch(const Queue &queue, const std::vector &global, + const std::vector &local, Event &event) { + CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), + nullptr, global.data(), local.data(), + 0, nullptr, &(event()))); + } + + // Accessor to the private data-member + const cl_kernel& operator()() const { return *kernel_; } private: - cl_mem buffer_; + std::shared_ptr kernel_; + + // Internal implementation for the recursive SetArguments function. + template + void SetArgumentsRecursive(const size_t index, T &first) { + SetArgument(index, first); + } + template + void SetArgumentsRecursive(const size_t index, T &first, Args&... args) { + SetArgument(index, first); + SetArgumentsRecursive(index+1, args...); + } }; // ================================================================================================= diff --git a/include/internal/database.h b/include/internal/database.h index 33ad1979..8c937e34 100644 --- a/include/internal/database.h +++ b/include/internal/database.h @@ -39,7 +39,7 @@ class Database { const Parameters parameters; }; struct DatabaseVendor { - const cl_device_type type; + const std::string type; const std::string name; const std::vector devices; }; @@ -49,8 +49,21 @@ class Database { const std::vector vendors; }; - // The default vendor or device - static constexpr auto kDefault = "Default"; + // The OpenCL device types + static constexpr auto kDeviceTypeCPU = "CPU"; + static constexpr auto kDeviceTypeGPU = "GPU"; + static constexpr auto kDeviceTypeAccelerator = "accelerator"; + static constexpr auto kDeviceTypeAll = "default"; + + // The OpenCL device vendors + static constexpr auto kDeviceVendorNVIDIA = "NVIDIA Corporation"; + static constexpr auto kDeviceVendorAMD = "Advanced Micro Devices, Inc."; + static constexpr auto kDeviceVendorIntel = "Intel"; + static constexpr auto kDeviceVendorAll = "default"; + + // The OpenCL device names + static constexpr auto kDefaultDevice = "default"; + // The database consists of separate database entries, stored together in a vector static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble; @@ -63,7 +76,7 @@ class Database { static const std::vector database; // The constructor - explicit Database(const CommandQueue &queue, const std::vector &routines, + explicit Database(const Queue &queue, const std::vector &routines, const Precision precision); // Accessor of values by key @@ -73,13 +86,10 @@ class Database { std::string GetDefines() const; private: - Parameters Search(const std::string &this_kernel, const cl_device_type this_type, + Parameters Search(const std::string &this_kernel, const std::string &this_type, const std::string &this_vendor, const std::string &this_device, const Precision this_precision) const; - // Tests equality between a database-vendor string and an OpenCL vendor string - bool VendorEqual(const std::string &db_vendor, const std::string &cl_vendor) const; - // Found parameters suitable for this device/kernel Parameters parameters_; }; diff --git a/include/internal/database/copy.h b/include/internal/database/copy.h index dfd69b80..541a352b 100644 --- a/include/internal/database/copy.h +++ b/include/internal/database/copy.h @@ -17,25 +17,25 @@ namespace clblast { const Database::DatabaseEntry Database::CopySingle = { "Copy", Precision::kSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_WPT",2}, {"COPY_VW",4} } }, { "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",4}, {"COPY_VW",4} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",4} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, } }, } @@ -46,24 +46,24 @@ const Database::DatabaseEntry Database::CopySingle = { const Database::DatabaseEntry Database::CopyDouble = { "Copy", Precision::kDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, { "Tesla K20m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } }, { "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, } }, } @@ -74,25 +74,25 @@ const Database::DatabaseEntry Database::CopyDouble = { const Database::DatabaseEntry Database::CopyComplexSingle = { "Copy", Precision::kComplexSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",1}, {"COPY_VW",1} } }, { "Tesla K20m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",1} } }, { "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, } }, } @@ -103,24 +103,24 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { const Database::DatabaseEntry Database::CopyComplexDouble = { "Copy", Precision::kComplexDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",1}, {"COPY_VW",1} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } }, } }, } diff --git a/include/internal/database/pad.h b/include/internal/database/pad.h index 61ec3242..4a599648 100644 --- a/include/internal/database/pad.h +++ b/include/internal/database/pad.h @@ -17,25 +17,25 @@ namespace clblast { const Database::DatabaseEntry Database::PadSingle = { "Pad", Precision::kSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, { "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } @@ -46,24 +46,24 @@ const Database::DatabaseEntry Database::PadSingle = { const Database::DatabaseEntry Database::PadDouble = { "Pad", Precision::kDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } @@ -74,25 +74,25 @@ const Database::DatabaseEntry Database::PadDouble = { const Database::DatabaseEntry Database::PadComplexSingle = { "Pad", Precision::kComplexSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } @@ -103,24 +103,24 @@ const Database::DatabaseEntry Database::PadComplexSingle = { const Database::DatabaseEntry Database::PadComplexDouble = { "Pad", Precision::kComplexDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } diff --git a/include/internal/database/padtranspose.h b/include/internal/database/padtranspose.h index 8f6fcba0..53226c1d 100644 --- a/include/internal/database/padtranspose.h +++ b/include/internal/database/padtranspose.h @@ -17,25 +17,25 @@ namespace clblast { const Database::DatabaseEntry Database::PadTraSingle = { "PadTranspose", Precision::kSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } }, { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } }, { "Tesla K40m", { {"PADTRA_TILE",32}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, } }, } @@ -46,24 +46,24 @@ const Database::DatabaseEntry Database::PadTraSingle = { const Database::DatabaseEntry Database::PadTraDouble = { "PadTranspose", Precision::kDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, { "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, } }, } @@ -74,25 +74,25 @@ const Database::DatabaseEntry Database::PadTraDouble = { const Database::DatabaseEntry Database::PadTraComplexSingle = { "PadTranspose", Precision::kComplexSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, { "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, } }, } @@ -103,24 +103,24 @@ const Database::DatabaseEntry Database::PadTraComplexSingle = { const Database::DatabaseEntry Database::PadTraComplexDouble = { "PadTranspose", Precision::kComplexDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, { "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } }, } }, } diff --git a/include/internal/database/transpose.h b/include/internal/database/transpose.h index b348f364..1d12a13e 100644 --- a/include/internal/database/transpose.h +++ b/include/internal/database/transpose.h @@ -17,25 +17,25 @@ namespace clblast { const Database::DatabaseEntry Database::TraSingle = { "Transpose", Precision::kSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } }, } }, } @@ -46,24 +46,24 @@ const Database::DatabaseEntry Database::TraSingle = { const Database::DatabaseEntry Database::TraDouble = { "Transpose", Precision::kDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } }, } }, } @@ -74,25 +74,25 @@ const Database::DatabaseEntry Database::TraDouble = { const Database::DatabaseEntry Database::TraComplexSingle = { "Transpose", Precision::kComplexSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } }, } }, } @@ -103,24 +103,24 @@ const Database::DatabaseEntry Database::TraComplexSingle = { const Database::DatabaseEntry Database::TraComplexDouble = { "Transpose", Precision::kComplexDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } }, } }, } diff --git a/include/internal/database/xaxpy.h b/include/internal/database/xaxpy.h index 40747678..058e3c0a 100644 --- a/include/internal/database/xaxpy.h +++ b/include/internal/database/xaxpy.h @@ -17,25 +17,25 @@ namespace clblast { const Database::DatabaseEntry Database::XaxpySingle = { "Xaxpy", Precision::kSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",2} } }, { "Tesla K20m", { {"WGS",128}, {"WPT",2}, {"VW",2} } }, { "Tesla K40m", { {"WGS",128}, {"WPT",1}, {"VW",4} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",2} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"WGS",512}, {"WPT",1}, {"VW",1} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } }, } }, } @@ -46,24 +46,24 @@ const Database::DatabaseEntry Database::XaxpySingle = { const Database::DatabaseEntry Database::XaxpyDouble = { "Xaxpy", Precision::kDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",1} } }, { "Tesla K20m", { {"WGS",512}, {"WPT",1}, {"VW",2} } }, { "Tesla K40m", { {"WGS",64}, {"WPT",1}, {"VW",2} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"WGS",256}, {"WPT",1}, {"VW",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } }, } }, } @@ -73,25 +73,25 @@ const Database::DatabaseEntry Database::XaxpyDouble = { const Database::DatabaseEntry Database::XaxpyComplexSingle = { "Xaxpy", Precision::kComplexSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"WGS",256}, {"WPT",1}, {"VW",1} } }, { "Tesla K20m", { {"WGS",128}, {"WPT",1}, {"VW",1} } }, { "Tesla K40m", { {"WGS",128}, {"WPT",2}, {"VW",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"WGS",256}, {"WPT",1}, {"VW",1} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } }, } }, } @@ -102,24 +102,24 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { const Database::DatabaseEntry Database::XaxpyComplexDouble = { "Xaxpy", Precision::kComplexDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"WGS",128}, {"WPT",2}, {"VW",1} } }, { "Tesla K20m", { {"WGS",256}, {"WPT",1}, {"VW",1} } }, { "Tesla K40m", { {"WGS",64}, {"WPT",2}, {"VW",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } }, } }, } diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h index c2fe9bcb..49598c8c 100644 --- a/include/internal/database/xgemm.h +++ b/include/internal/database/xgemm.h @@ -17,26 +17,26 @@ namespace clblast { const Database::DatabaseEntry Database::XgemmSingle = { "Xgemm", Precision::kSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"MWG",128}, {"NWG",64}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } }, { "Tesla K20m", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",4}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } }, { "Tesla K40m", { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } }, - { kDefault, { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } }, + { kDefaultDevice, { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"MWG",128}, {"NWG",128}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",8}, {"KWI",2}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, } }, } @@ -47,25 +47,25 @@ const Database::DatabaseEntry Database::XgemmSingle = { const Database::DatabaseEntry Database::XgemmDouble = { "Xgemm", Precision::kDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } }, { "Tesla K20m", { {"MWG",64}, {"NWG",128}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",32}, {"KWI",8}, {"VWM",2}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } }, { "Tesla K40m", { {"MWG",64}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } }, - { kDefault, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } }, + { kDefaultDevice, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, } }, } @@ -76,26 +76,26 @@ const Database::DatabaseEntry Database::XgemmDouble = { const Database::DatabaseEntry Database::XgemmComplexSingle = { "Xgemm", Precision::kComplexSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } }, { "Tesla K20m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",8}, {"KWI",8}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } }, { "Tesla K40m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",0}, {"STRN",1}, {"SA",1}, {"SB",1} } }, - { kDefault, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } }, + { kDefaultDevice, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, } }, } @@ -106,25 +106,25 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { const Database::DatabaseEntry Database::XgemmComplexDouble = { "Xgemm", Precision::kComplexDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } }, { "Tesla K20m", { {"MWG",16}, {"NWG",128}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",8}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } }, { "Tesla K40m", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",1} } }, - { kDefault, { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } }, + { kDefaultDevice, { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } }, } }, } diff --git a/include/internal/database/xgemv.h b/include/internal/database/xgemv.h index 0266dd3c..c315500f 100644 --- a/include/internal/database/xgemv.h +++ b/include/internal/database/xgemv.h @@ -17,25 +17,25 @@ namespace clblast { const Database::DatabaseEntry Database::XgemvSingle = { "Xgemv", Precision::kSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, { "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, { "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"WGS2",256}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",4} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"WGS1",256}, {"WPT1",2}, {"WGS2",64}, {"WPT2",4}, {"VW2",4}, {"WGS3",256}, {"WPT3",2}, {"VW3",8} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, } }, } @@ -46,24 +46,24 @@ const Database::DatabaseEntry Database::XgemvSingle = { const Database::DatabaseEntry Database::XgemvDouble = { "Xgemv", Precision::kDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, { "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, { "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, } }, } @@ -73,25 +73,25 @@ const Database::DatabaseEntry Database::XgemvDouble = { const Database::DatabaseEntry Database::XgemvComplexSingle = { "Xgemv", Precision::kComplexSingle, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, { "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, { "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { { "Iris", { {"WGS1",256}, {"WPT1",1}, {"WGS2",64}, {"WPT2",4}, {"VW2",2}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, } }, } @@ -102,24 +102,24 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = { const Database::DatabaseEntry Database::XgemvComplexDouble = { "Xgemv", Precision::kComplexDouble, { { // NVIDIA GPUs - CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", { + kDeviceTypeGPU, kDeviceVendorNVIDIA, { { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, { "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, { "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, } }, { // AMD GPUs - CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", { + kDeviceTypeGPU, kDeviceVendorAMD, { { "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, } }, { // Intel GPUs - CL_DEVICE_TYPE_GPU, "Intel", { + kDeviceTypeGPU, kDeviceVendorIntel, { } }, { // Default - CL_DEVICE_TYPE_ALL, kDefault, { - { kDefault, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, + kDeviceTypeAll, kDeviceVendorAll, { + { kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } }, } }, } diff --git a/include/internal/routine.h b/include/internal/routine.h index 911bda49..367917fd 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -26,6 +26,7 @@ namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class +template class Routine { public: @@ -52,7 +53,7 @@ class Routine { static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); } // Base class constructor - explicit Routine(CommandQueue &queue, Event &event, const std::string &name, + explicit Routine(Queue &queue, Event &event, const std::string &name, const std::vector &routines, const Precision precision); // Set-up phase of the kernel @@ -61,31 +62,31 @@ class Routine { protected: // Runs a kernel given the global and local thread sizes - StatusCode RunKernel(const Kernel &kernel, std::vector &global, + StatusCode RunKernel(Kernel &kernel, std::vector &global, const std::vector &local); // Tests for valid inputs of matrices A, B, and C - StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer &buffer, + StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer &buffer, const size_t offset, const size_t ld, const size_t data_size); - StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer &buffer, + StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer &buffer, const size_t offset, const size_t ld, const size_t data_size); - StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer &buffer, + StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer &buffer, const size_t offset, const size_t ld, const size_t data_size); // Tests for valid inputs of vectors X and Y - StatusCode TestVectorX(const size_t n, const Buffer &buffer, const size_t offset, + StatusCode TestVectorX(const size_t n, const Buffer &buffer, const size_t offset, const size_t inc, const size_t data_size); - StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset, + StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset, const size_t inc, const size_t data_size); // Copies/transposes a matrix and padds/unpads it with zeroes. This method is also able to write // to symmetric and triangular matrices through optional arguments. StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, - const Buffer &src, + const Buffer &src, const size_t dest_one, const size_t dest_two, const size_t dest_ld, const size_t dest_offset, - const Buffer &dest, + const Buffer &dest, const Program &program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const bool upper = false, const bool lower = false, @@ -106,14 +107,14 @@ class Routine { std::string source_string_; // The OpenCL objects, accessible only from derived classes - CommandQueue queue_; + Queue queue_; Event event_; const Context context_; const Device device_; // OpenCL device properties const std::string device_name_; - const cl_uint max_work_item_dimensions_; + const size_t max_work_item_dimensions_; const std::vector max_work_item_sizes_; const size_t max_work_group_size_; diff --git a/include/internal/routines/level1/xaxpy.h b/include/internal/routines/level1/xaxpy.h index e548e553..4b9da890 100644 --- a/include/internal/routines/level1/xaxpy.h +++ b/include/internal/routines/level1/xaxpy.h @@ -21,14 +21,26 @@ namespace clblast { // See comment at top of file for a description of the class template -class Xaxpy: public Routine { +class Xaxpy: public Routine { public: - Xaxpy(CommandQueue &queue, Event &event); + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::GetProgramFromCache; + using Routine::TestVectorX; + using Routine::TestVectorY; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xaxpy(Queue &queue, Event &event); // Templated-precision implementation of the routine StatusCode DoAxpy(const size_t n, const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); private: // Static variable to get the precision diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h index a3109036..5ada9b03 100644 --- a/include/internal/routines/level2/xgemv.h +++ b/include/internal/routines/level2/xgemv.h @@ -21,18 +21,31 @@ namespace clblast { // See comment at top of file for a description of the class template -class Xgemv: public Routine { +class Xgemv: public Routine { public: - Xgemv(CommandQueue &queue, Event &event); + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::GetProgramFromCache; + using Routine::TestVectorX; + using Routine::TestVectorY; + using Routine::TestMatrixA; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xgemv(Queue &queue, Event &event); // Templated-precision implementation of the routine StatusCode DoGemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc); private: // Static variable to get the precision diff --git a/include/internal/routines/level3/xgemm.h b/include/internal/routines/level3/xgemm.h index 7ad4fcfb..a0c8b595 100644 --- a/include/internal/routines/level3/xgemm.h +++ b/include/internal/routines/level3/xgemm.h @@ -21,18 +21,33 @@ namespace clblast { // See comment at top of file for a description of the class template -class Xgemm: public Routine { +class Xgemm: public Routine { public: - Xgemm(CommandQueue &queue, Event &event); + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::PadCopyTransposeMatrix; + using Routine::TestMatrixA; + using Routine::TestMatrixB; + using Routine::TestMatrixC; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xgemm(Queue &queue, Event &event); // Templated-precision implementation of the routine StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); private: // Static variable to get the precision diff --git a/include/internal/routines/level3/xhemm.h b/include/internal/routines/level3/xhemm.h index 6cc9d9ec..5f1e8723 100644 --- a/include/internal/routines/level3/xhemm.h +++ b/include/internal/routines/level3/xhemm.h @@ -25,30 +25,28 @@ template class Xhemm: public Xgemm { public: - // Uses several variables from the Routine class - using Routine::db_; - using Routine::context_; - - // Uses several helper functions from the Routine class - using Routine::RunKernel; - using Routine::ErrorIn; - using Routine::TestMatrixA; - using Routine::GetProgramFromCache; + // Members and methods from the base class + using Routine::db_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::TestMatrixA; + using Routine::RunKernel; + using Routine::ErrorIn; // Uses the regular Xgemm routine using Xgemm::DoGemm; // Constructor - Xhemm(CommandQueue &queue, Event &event); + Xhemm(Queue &queue, Event &event); // Templated-precision implementation of the routine StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= diff --git a/include/internal/routines/level3/xher2k.h b/include/internal/routines/level3/xher2k.h index 1836a812..9e961d23 100644 --- a/include/internal/routines/level3/xher2k.h +++ b/include/internal/routines/level3/xher2k.h @@ -23,18 +23,33 @@ namespace clblast { // See comment at top of file for a description of the class template -class Xher2k: public Routine { +class Xher2k: public Routine { public: - Xher2k(CommandQueue &queue, Event &event); + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::PadCopyTransposeMatrix; + using Routine::TestMatrixA; + using Routine::TestMatrixB; + using Routine::TestMatrixC; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xher2k(Queue &queue, Event &event); // Templated-precision implementation of the routine StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); private: // Static variable to get the precision diff --git a/include/internal/routines/level3/xherk.h b/include/internal/routines/level3/xherk.h index 9b361254..f285a71c 100644 --- a/include/internal/routines/level3/xherk.h +++ b/include/internal/routines/level3/xherk.h @@ -23,17 +23,31 @@ namespace clblast { // See comment at top of file for a description of the class template -class Xherk: public Routine { +class Xherk: public Routine { public: - Xherk(CommandQueue &queue, Event &event); + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::PadCopyTransposeMatrix; + using Routine::TestMatrixA; + using Routine::TestMatrixC; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xherk(Queue &queue, Event &event); // Templated-precision implementation of the routine StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const U alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); private: // Static variable to get the precision diff --git a/include/internal/routines/level3/xsymm.h b/include/internal/routines/level3/xsymm.h index 2028ceea..9ed3c722 100644 --- a/include/internal/routines/level3/xsymm.h +++ b/include/internal/routines/level3/xsymm.h @@ -27,30 +27,28 @@ template class Xsymm: public Xgemm { public: - // Uses several variables from the Routine class - using Routine::db_; - using Routine::context_; - - // Uses several helper functions from the Routine class - using Routine::RunKernel; - using Routine::ErrorIn; - using Routine::TestMatrixA; - using Routine::GetProgramFromCache; + // Members and methods from the base class + using Routine::db_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::TestMatrixA; + using Routine::RunKernel; + using Routine::ErrorIn; // Uses the regular Xgemm routine using Xgemm::DoGemm; // Constructor - Xsymm(CommandQueue &queue, Event &event); + Xsymm(Queue &queue, Event &event); // Templated-precision implementation of the routine StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); }; // ================================================================================================= diff --git a/include/internal/routines/level3/xsyr2k.h b/include/internal/routines/level3/xsyr2k.h index 6259313c..85936658 100644 --- a/include/internal/routines/level3/xsyr2k.h +++ b/include/internal/routines/level3/xsyr2k.h @@ -23,18 +23,33 @@ namespace clblast { // See comment at top of file for a description of the class template -class Xsyr2k: public Routine { +class Xsyr2k: public Routine { public: - Xsyr2k(CommandQueue &queue, Event &event); + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::PadCopyTransposeMatrix; + using Routine::TestMatrixA; + using Routine::TestMatrixB; + using Routine::TestMatrixC; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xsyr2k(Queue &queue, Event &event); // Templated-precision implementation of the routine StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); private: // Static variable to get the precision diff --git a/include/internal/routines/level3/xsyrk.h b/include/internal/routines/level3/xsyrk.h index 3dab731f..14d51a58 100644 --- a/include/internal/routines/level3/xsyrk.h +++ b/include/internal/routines/level3/xsyrk.h @@ -25,17 +25,31 @@ namespace clblast { // See comment at top of file for a description of the class template -class Xsyrk: public Routine { +class Xsyrk: public Routine { public: - Xsyrk(CommandQueue &queue, Event &event); + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::PadCopyTransposeMatrix; + using Routine::TestMatrixA; + using Routine::TestMatrixC; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xsyrk(Queue &queue, Event &event); // Templated-precision implementation of the routine StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); private: // Static variable to get the precision diff --git a/include/internal/routines/level3/xtrmm.h b/include/internal/routines/level3/xtrmm.h index 4f49bebd..d8ac60fd 100644 --- a/include/internal/routines/level3/xtrmm.h +++ b/include/internal/routines/level3/xtrmm.h @@ -26,29 +26,27 @@ template class Xtrmm: public Xgemm { public: - // Uses several variables from the Routine class - using Routine::db_; - using Routine::context_; - - // Uses several helper functions from the Routine class - using Routine::RunKernel; - using Routine::ErrorIn; - using Routine::TestMatrixA; - using Routine::GetProgramFromCache; + // Members and methods from the base class + using Routine::db_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::TestMatrixA; + using Routine::RunKernel; + using Routine::ErrorIn; // Uses the regular Xgemm routine using Xgemm::DoGemm; // Constructor - Xtrmm(CommandQueue &queue, Event &event); + Xtrmm(Queue &queue, Event &event); // Templated-precision implementation of the routine StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld); + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld); }; // ================================================================================================= diff --git a/include/internal/utilities.h b/include/internal/utilities.h index 60d70eae..6dba24e1 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -131,12 +131,13 @@ struct Arguments { }; // Structure containing all possible buffers for test clients +template struct Buffers { - Buffer x_vec; - Buffer y_vec; - Buffer a_mat; - Buffer b_mat; - Buffer c_mat; + Buffer x_vec; + Buffer y_vec; + Buffer a_mat; + Buffer b_mat; + Buffer c_mat; }; // ================================================================================================= diff --git a/src/clblast.cc b/src/clblast.cc index 6cb4086e..eddb8022 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -43,7 +43,7 @@ StatusCode Axpy(const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xaxpy(queue_cpp, event_cpp); @@ -53,8 +53,8 @@ StatusCode Axpy(const size_t n, const T alpha, // Runs the routine return routine.DoAxpy(n, alpha, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc); + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); } template StatusCode Axpy(const size_t, const float, const cl_mem, const size_t, const size_t, @@ -85,7 +85,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xgemv(queue_cpp, event_cpp); @@ -95,9 +95,9 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, // Runs the routine return routine.DoGemv(layout, a_transpose, m, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, beta, - Buffer(y_buffer), y_offset, y_inc); + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, beta, + Buffer(y_buffer), y_offset, y_inc); } template StatusCode Gemv(const Layout, const Transpose, const size_t, const size_t, const float, @@ -135,7 +135,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xgemm(queue_cpp, event_cpp); @@ -145,9 +145,9 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos // Runs the routine return routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(c_buffer), c_offset, c_ld); } template StatusCode Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float, @@ -184,7 +184,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xsymm(queue_cpp, event_cpp); @@ -194,9 +194,9 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, // Runs the routine return routine.DoSymm(layout, side, triangle, m, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(c_buffer), c_offset, c_ld); } template StatusCode Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const float, @@ -233,7 +233,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xhemm(queue_cpp, event_cpp); @@ -243,9 +243,9 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, // Runs the routine return routine.DoHemm(layout, side, triangle, m, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(c_buffer), c_offset, c_ld); } template StatusCode Hemm(const Layout, const Side, const Triangle, const size_t, const size_t, const float2, @@ -269,7 +269,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xsyrk(queue_cpp, event_cpp); @@ -279,8 +279,8 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ // Runs the routine return routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha, - Buffer(a_buffer), a_offset, a_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer(a_buffer), a_offset, a_ld, beta, + Buffer(c_buffer), c_offset, c_ld); } template StatusCode Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, @@ -312,7 +312,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xherk,T>(queue_cpp, event_cpp); @@ -322,8 +322,8 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ // Runs the routine return routine.DoHerk(layout, triangle, a_transpose, n, k, alpha, - Buffer(a_buffer), a_offset, a_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer>(a_buffer), a_offset, a_ld, beta, + Buffer>(c_buffer), c_offset, c_ld); } template StatusCode Herk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, @@ -346,7 +346,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xsyr2k(queue_cpp, event_cpp); @@ -356,9 +356,9 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a // Runs the routine return routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(c_buffer), c_offset, c_ld); } template StatusCode Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, @@ -395,7 +395,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const U beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xher2k(queue_cpp, event_cpp); @@ -405,9 +405,9 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a // Runs the routine return routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(c_buffer), c_offset, c_ld); } template StatusCode Her2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, @@ -433,7 +433,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xtrmm(queue_cpp, event_cpp); @@ -443,8 +443,8 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, // Runs the routine return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld); + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld); } template StatusCode Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, @@ -483,7 +483,7 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xtrsm(queue_cpp, event_cpp); @@ -493,8 +493,8 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, // Runs the routine return routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld); + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld); } template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, diff --git a/src/database.cc b/src/database.cc index 4d9d844e..258d861e 100644 --- a/src/database.cc +++ b/src/database.cc @@ -39,7 +39,7 @@ const std::vector Database::database = { // ================================================================================================= // Constructor, computing device properties and populating the parameter-vector from the database -Database::Database(const CommandQueue &queue, const std::vector &kernels, +Database::Database(const Queue &queue, const std::vector &kernels, const Precision precision): parameters_{} { @@ -71,7 +71,7 @@ std::string Database::GetDefines() const { // Searches the database for the right kernel and precision Database::Parameters Database::Search(const std::string &this_kernel, - const cl_device_type this_type, + const std::string &this_type, const std::string &this_vendor, const std::string &this_device, const Precision this_precision) const { @@ -81,13 +81,13 @@ Database::Parameters Database::Search(const std::string &this_kernel, // Searches for the right vendor and device type, or selects the default if unavailable. This // assumes that the default vendor / device type is last in the database. for (auto &vendor: db.vendors) { - if (VendorEqual(vendor.name, this_vendor) && - (vendor.type == this_type || vendor.type == CL_DEVICE_TYPE_ALL)) { + if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) && + (vendor.type == this_type || vendor.type == kDeviceTypeAll)) { // Searches for the right device. If the current device is unavailable, selects the vendor // default parameters. This assumes the default is last in the database. for (auto &device: vendor.devices) { - if (device.name == this_device || device.name == kDefault) { + if (device.name == this_device || device.name == kDefaultDevice) { // Sets the parameters accordingly return device.parameters; @@ -102,13 +102,5 @@ Database::Parameters Database::Search(const std::string &this_kernel, throw std::runtime_error("Database error, could not find a suitable entry"); } -// Determines the equality between two vendor names. This is implemented because vendor names can -// be ambigious and might change between different SDK or driver versions. -bool Database::VendorEqual(const std::string &db_vendor, const std::string &cl_vendor) const { - if (db_vendor == kDefault) { return true; } - if (db_vendor == cl_vendor) { return true; } - return false; -} - // ================================================================================================= } // namespace clblast diff --git a/src/routine.cc b/src/routine.cc index aded1a31..31476c42 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -13,17 +13,17 @@ #include "internal/routine.h" -#include "internal/utilities.h" - namespace clblast { // ================================================================================================= // The cache of compiled OpenCL programs -std::vector Routine::program_cache_; +template +std::vector::ProgramCache> Routine::program_cache_; // Constructor: not much here, because no status codes can be returned -Routine::Routine(CommandQueue &queue, Event &event, const std::string &name, - const std::vector &routines, const Precision precision): +template +Routine::Routine(Queue &queue, Event &event, const std::string &name, + const std::vector &routines, const Precision precision): precision_(precision), routine_name_(name), queue_(queue), @@ -40,14 +40,15 @@ Routine::Routine(CommandQueue &queue, Event &event, const std::string &name, // ================================================================================================= // Separate set-up function to allow for status codes to be returned -StatusCode Routine::SetUp() { +template +StatusCode Routine::SetUp() { // Queries the cache to see whether or not the compiled kernel is already there. If not, it will // be built and added to the cache. if (!ProgramIsInCache()) { // Inspects whether or not cl_khr_fp64 is supported in case of double precision - auto extensions = device_.Extensions(); + auto extensions = device_.Capabilities(); if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) { if (extensions.find(kKhronosDoublePrecision) == std::string::npos) { return StatusCode::kNoDoublePrecision; @@ -85,16 +86,16 @@ StatusCode Routine::SetUp() { // Compiles the kernel try { auto program = Program(context_, source_string); - auto options = std::string{}; - auto status = program.Build(device_, options); + auto options = std::vector(); + auto build_status = program.Build(device_, options); // Checks for compiler crashes/errors/warnings - if (status == CL_BUILD_PROGRAM_FAILURE) { + if (build_status == BuildStatus::kError) { auto message = program.GetBuildInfo(device_); fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str()); return StatusCode::kBuildProgramFailure; } - if (status == CL_INVALID_BINARY) { return StatusCode::kInvalidBinary; } + if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; } // Store the compiled program in the cache program_cache_.push_back({program, device_name_, precision_, routine_name_}); @@ -108,8 +109,9 @@ StatusCode Routine::SetUp() { // ================================================================================================= // Enqueues a kernel, waits for completion, and checks for errors -StatusCode Routine::RunKernel(const Kernel &kernel, std::vector &global, - const std::vector &local) { +template +StatusCode Routine::RunKernel(Kernel &kernel, std::vector &global, + const std::vector &local) { // Tests for validity of the local thread sizes if (local.size() > max_work_item_dimensions_) { @@ -132,12 +134,14 @@ StatusCode Routine::RunKernel(const Kernel &kernel, std::vector &global, if (!device_.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; } // Launches the kernel (and checks for launch errors) - auto status = queue_.EnqueueKernel(kernel, global, local, event_); - if (status != CL_SUCCESS) { return StatusCode::kKernelLaunchError; } + try { + kernel.Launch(queue_, global, local, event_); + } catch (...) { return StatusCode::kKernelLaunchError; } // Waits for completion of the kernel - status = event_.Wait(); - if (status != CL_SUCCESS) { return StatusCode::kKernelRunError; } + try { + queue_.Finish(event_); + } catch (...) { return StatusCode::kKernelRunError; } // No errors, normal termination of this function return StatusCode::kSuccess; @@ -147,8 +151,9 @@ StatusCode Routine::RunKernel(const Kernel &kernel, std::vector &global, // Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a // sufficient buffer size. -StatusCode Routine::TestMatrixA(const size_t one, const size_t two, const Buffer &buffer, - const size_t offset, const size_t ld, const size_t data_size) { +template +StatusCode Routine::TestMatrixA(const size_t one, const size_t two, const Buffer &buffer, + const size_t offset, const size_t ld, const size_t data_size) { if (ld < one) { return StatusCode::kInvalidLeadDimA; } try { auto required_size = (ld*two + offset)*data_size; @@ -160,8 +165,9 @@ StatusCode Routine::TestMatrixA(const size_t one, const size_t two, const Buffer // Tests matrix B for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a // sufficient buffer size. -StatusCode Routine::TestMatrixB(const size_t one, const size_t two, const Buffer &buffer, - const size_t offset, const size_t ld, const size_t data_size) { +template +StatusCode Routine::TestMatrixB(const size_t one, const size_t two, const Buffer &buffer, + const size_t offset, const size_t ld, const size_t data_size) { if (ld < one) { return StatusCode::kInvalidLeadDimB; } try { auto required_size = (ld*two + offset)*data_size; @@ -173,8 +179,9 @@ StatusCode Routine::TestMatrixB(const size_t one, const size_t two, const Buffer // Tests matrix C for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a // sufficient buffer size. -StatusCode Routine::TestMatrixC(const size_t one, const size_t two, const Buffer &buffer, - const size_t offset, const size_t ld, const size_t data_size) { +template +StatusCode Routine::TestMatrixC(const size_t one, const size_t two, const Buffer &buffer, + const size_t offset, const size_t ld, const size_t data_size) { if (ld < one) { return StatusCode::kInvalidLeadDimC; } try { auto required_size = (ld*two + offset)*data_size; @@ -188,8 +195,9 @@ StatusCode Routine::TestMatrixC(const size_t one, const size_t two, const Buffer // Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a // sufficient buffer size. -StatusCode Routine::TestVectorX(const size_t n, const Buffer &buffer, const size_t offset, - const size_t inc, const size_t data_size) { +template +StatusCode Routine::TestVectorX(const size_t n, const Buffer &buffer, const size_t offset, + const size_t inc, const size_t data_size) { if (inc == 0) { return StatusCode::kInvalidIncrementX; } try { auto required_size = (n*inc + offset)*data_size; @@ -201,8 +209,9 @@ StatusCode Routine::TestVectorX(const size_t n, const Buffer &buffer, const size // Tests vector Y for validity: checks for a valid increment, a valid OpenCL buffer, and for a // sufficient buffer size. -StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size_t offset, - const size_t inc, const size_t data_size) { +template +StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size_t offset, + const size_t inc, const size_t data_size) { if (inc == 0) { return StatusCode::kInvalidIncrementY; } try { auto required_size = (n*inc + offset)*data_size; @@ -215,16 +224,17 @@ StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size // ================================================================================================= // Copies or transposes a matrix and pads/unpads it with zeros -StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two, - const size_t src_ld, const size_t src_offset, - const Buffer &src, - const size_t dest_one, const size_t dest_two, - const size_t dest_ld, const size_t dest_offset, - const Buffer &dest, - const Program &program, const bool do_pad, - const bool do_transpose, const bool do_conjugate, - const bool upper, const bool lower, - const bool diagonal_imag_zero) { +template +StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two, + const size_t src_ld, const size_t src_offset, + const Buffer &src, + const size_t dest_one, const size_t dest_two, + const size_t dest_ld, const size_t dest_offset, + const Buffer &dest, + const Program &program, const bool do_pad, + const bool do_transpose, const bool do_conjugate, + const bool upper, const bool lower, + const bool diagonal_imag_zero) { // Determines whether or not the fast-version could potentially be used auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && @@ -328,7 +338,8 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr // Queries the cache and retrieves a matching program. Assumes that the match is available, throws // otherwise. -const Program& Routine::GetProgramFromCache() const { +template +const Program& Routine::GetProgramFromCache() const { for (auto &cached_program: program_cache_) { if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return cached_program.program; @@ -338,12 +349,21 @@ const Program& Routine::GetProgramFromCache() const { } // Queries the cache to see whether or not the compiled kernel is already there -bool Routine::ProgramIsInCache() const { +template +bool Routine::ProgramIsInCache() const { for (auto &cached_program: program_cache_) { if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return true; } } return false; } +// ================================================================================================= + +// Compiles the templated class +template class Routine; +template class Routine; +template class Routine; +template class Routine; + // ================================================================================================= } // namespace clblast diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc index e6b320d9..7646b0e4 100644 --- a/src/routines/level1/xaxpy.cc +++ b/src/routines/level1/xaxpy.cc @@ -29,8 +29,8 @@ template <> const Precision Xaxpy::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xaxpy::Xaxpy(CommandQueue &queue, Event &event): - Routine(queue, event, "AXPY", {"Xaxpy"}, precision_) { +Xaxpy::Xaxpy(Queue &queue, Event &event): + Routine(queue, event, "AXPY", {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/xaxpy.opencl" ; @@ -41,8 +41,8 @@ Xaxpy::Xaxpy(CommandQueue &queue, Event &event): // The main routine template StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { // Makes sure all dimensions are larger than zero if (n == 0) { return StatusCode::kInvalidDimension; } diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index a7052af8..75219b63 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -29,8 +29,8 @@ template <> const Precision Xgemv::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xgemv::Xgemv(CommandQueue &queue, Event &event): - Routine(queue, event, "GEMV", {"Xgemv"}, precision_) { +Xgemv::Xgemv(Queue &queue, Event &event): + Routine(queue, event, "GEMV", {"Xgemv"}, precision_) { source_string_ = #include "../../kernels/xgemv.opencl" ; @@ -43,10 +43,10 @@ template StatusCode Xgemv::DoGemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { // Makes sure all dimensions are larger than zero if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 85524891..525a82e6 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -29,8 +29,8 @@ template <> const Precision Xgemm::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xgemm::Xgemm(CommandQueue &queue, Event &event): - Routine(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xgemm::Xgemm(Queue &queue, Event &event): + Routine(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/copy.opencl" #include "../../kernels/pad.opencl" @@ -48,10 +48,10 @@ StatusCode Xgemm::DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; } @@ -117,9 +117,9 @@ StatusCode Xgemm::DoGemm(const Layout layout, c_do_transpose == false; // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T)); - auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T)); + auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, k_ceiled*m_ceiled); + auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, m_ceiled*n_ceiled); // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc index bc257c44..a1c0c7c1 100644 --- a/src/routines/level3/xhemm.cc +++ b/src/routines/level3/xhemm.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xhemm::Xhemm(CommandQueue &queue, Event &event): +Xhemm::Xhemm(Queue &queue, Event &event): Xgemm(queue, event) { } @@ -32,10 +32,10 @@ template StatusCode Xhemm::DoHemm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } @@ -56,7 +56,7 @@ StatusCode Xhemm::DoHemm(const Layout layout, const Side side, const Triangle // Temporary buffer for a copy of the hermitian matrix try { - auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); + auto temp_herm = Buffer(context_, k*k); // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm // routine afterwards diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index fa42733f..29b2f733 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -27,8 +27,8 @@ template <> const Precision Xher2k::precision_ = Precision::kCom // Constructor: forwards to base class constructor template -Xher2k::Xher2k(CommandQueue &queue, Event &event): - Routine(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xher2k::Xher2k(Queue &queue, Event &event): + Routine(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/copy.opencl" #include "../../kernels/pad.opencl" @@ -45,10 +45,10 @@ template StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } @@ -105,11 +105,11 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co ab_rotated == false && ab_conjugate == true; // Creates the temporary matrices - auto a1_temp = (a1_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto a2_temp = (a2_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto b1_temp = (b1_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + auto a1_temp = (a1_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto a2_temp = (a2_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto b1_temp = (b1_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer(context_, n_ceiled*n_ceiled); // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index ae350050..5174e9ab 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -27,8 +27,8 @@ template <> const Precision Xherk::precision_ = Precision::kComp // Constructor: forwards to base class constructor template -Xherk::Xherk(CommandQueue &queue, Event &event): - Routine(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xherk::Xherk(Queue &queue, Event &event): + Routine(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/copy.opencl" #include "../../kernels/pad.opencl" @@ -45,9 +45,9 @@ template StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const U alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } @@ -98,9 +98,9 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons a_rotated == false && b_conjugate == false; // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer(context_, n_ceiled*n_ceiled); // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc index 1d17f0eb..37c08d3b 100644 --- a/src/routines/level3/xsymm.cc +++ b/src/routines/level3/xsymm.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xsymm::Xsymm(CommandQueue &queue, Event &event): +Xsymm::Xsymm(Queue &queue, Event &event): Xgemm(queue, event) { } @@ -32,10 +32,10 @@ template StatusCode Xsymm::DoSymm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } @@ -56,7 +56,7 @@ StatusCode Xsymm::DoSymm(const Layout layout, const Side side, const Triangle // Temporary buffer for a copy of the symmetric matrix try { - auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); + auto temp_symm = Buffer(context_, k*k); // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm // routine afterwards diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index 7ab3430a..b36e7c5e 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -29,8 +29,8 @@ template <> const Precision Xsyr2k::precision_ = Precision::kComplexDou // Constructor: forwards to base class constructor template -Xsyr2k::Xsyr2k(CommandQueue &queue, Event &event): - Routine(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xsyr2k::Xsyr2k(Queue &queue, Event &event): + Routine(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/copy.opencl" #include "../../kernels/pad.opencl" @@ -47,10 +47,10 @@ template StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } @@ -99,9 +99,9 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons ab_rotated == false; // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer(context_, n_ceiled*n_ceiled); // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index c6feb5e6..e4668216 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -29,8 +29,8 @@ template <> const Precision Xsyrk::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xsyrk::Xsyrk(CommandQueue &queue, Event &event): - Routine(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xsyrk::Xsyrk(Queue &queue, Event &event): + Routine(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/copy.opencl" #include "../../kernels/pad.opencl" @@ -47,9 +47,9 @@ template StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } @@ -93,8 +93,8 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const a_rotated == false; // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer(context_, n_ceiled*n_ceiled); // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc index 52f272e3..8be7d950 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xtrmm::Xtrmm(CommandQueue &queue, Event &event): +Xtrmm::Xtrmm(Queue &queue, Event &event): Xgemm(queue, event) { } @@ -33,8 +33,8 @@ StatusCode Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } @@ -58,7 +58,7 @@ StatusCode Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle // Temporary buffer for a copy of the triangular matrix try { - auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); + auto temp_triangular = Buffer(context_, k*k); // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm // routine afterwards diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index 5951b177..ff81f4c3 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -76,31 +76,31 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st for (auto &args: test_vector) { // Runs the reference clBLAS code - auto x_vec1 = Buffer(context_, CL_MEM_READ_WRITE, args.x_size*sizeof(T)); - auto y_vec1 = Buffer(context_, CL_MEM_READ_WRITE, args.y_size*sizeof(T)); - auto a_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.a_size*sizeof(T)); - auto b_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.b_size*sizeof(T)); - auto c_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.c_size*sizeof(T)); - x_vec1.WriteBuffer(queue_, args.x_size*sizeof(T), x_source_); - y_vec1.WriteBuffer(queue_, args.y_size*sizeof(T), y_source_); - a_mat1.WriteBuffer(queue_, args.a_size*sizeof(T), a_source_); - b_mat1.WriteBuffer(queue_, args.b_size*sizeof(T), b_source_); - c_mat1.WriteBuffer(queue_, args.c_size*sizeof(T), c_source_); - auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1}; + auto x_vec1 = Buffer(context_, args.x_size); + auto y_vec1 = Buffer(context_, args.y_size); + auto a_mat1 = Buffer(context_, args.a_size); + auto b_mat1 = Buffer(context_, args.b_size); + auto c_mat1 = Buffer(context_, args.c_size); + x_vec1.Write(queue_, args.x_size, x_source_); + y_vec1.Write(queue_, args.y_size, y_source_); + a_mat1.Write(queue_, args.a_size, a_source_); + b_mat1.Write(queue_, args.b_size, b_source_); + c_mat1.Write(queue_, args.c_size, c_source_); + auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1}; auto status1 = run_reference_(args, buffers1, queue_); // Runs the CLBlast code - auto x_vec2 = Buffer(context_, CL_MEM_READ_WRITE, args.x_size*sizeof(T)); - auto y_vec2 = Buffer(context_, CL_MEM_READ_WRITE, args.y_size*sizeof(T)); - auto a_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.a_size*sizeof(T)); - auto b_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.b_size*sizeof(T)); - auto c_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.c_size*sizeof(T)); - x_vec2.WriteBuffer(queue_, args.x_size*sizeof(T), x_source_); - y_vec2.WriteBuffer(queue_, args.y_size*sizeof(T), y_source_); - a_mat2.WriteBuffer(queue_, args.a_size*sizeof(T), a_source_); - b_mat2.WriteBuffer(queue_, args.b_size*sizeof(T), b_source_); - c_mat2.WriteBuffer(queue_, args.c_size*sizeof(T), c_source_); - auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2}; + auto x_vec2 = Buffer(context_, args.x_size); + auto y_vec2 = Buffer(context_, args.y_size); + auto a_mat2 = Buffer(context_, args.a_size); + auto b_mat2 = Buffer(context_, args.b_size); + auto c_mat2 = Buffer(context_, args.c_size); + x_vec2.Write(queue_, args.x_size, x_source_); + y_vec2.Write(queue_, args.y_size, y_source_); + a_mat2.Write(queue_, args.a_size, a_source_); + b_mat2.Write(queue_, args.b_size, b_source_); + c_mat2.Write(queue_, args.c_size, c_source_); + auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2}; auto status2 = run_routine_(args, buffers2, queue_); // Tests for equality of the two status codes @@ -149,25 +149,25 @@ void TestBlas::TestInvalid(std::vector> &test_vector, const st auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); - auto x_vec1 = Buffer(x1); - auto y_vec1 = Buffer(y1); - auto a_mat1 = Buffer(a1); - auto b_mat1 = Buffer(b1); - auto c_mat1 = Buffer(c1); + auto x_vec1 = Buffer(x1); + auto y_vec1 = Buffer(y1); + auto a_mat1 = Buffer(a1); + auto b_mat1 = Buffer(b1); + auto c_mat1 = Buffer(c1); auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); - auto x_vec2 = Buffer(x2); - auto y_vec2 = Buffer(y2); - auto a_mat2 = Buffer(a2); - auto b_mat2 = Buffer(b2); - auto c_mat2 = Buffer(c2); + auto x_vec2 = Buffer(x2); + auto y_vec2 = Buffer(y2); + auto a_mat2 = Buffer(a2); + auto b_mat2 = Buffer(b2); + auto c_mat2 = Buffer(c2); // Runs the two routines - auto status1 = run_reference_(args, Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1}, queue_); - auto status2 = run_routine_(args, Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2}, queue_); + auto status1 = run_reference_(args, Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1}, queue_); + auto status2 = run_routine_(args, Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2}, queue_); // Tests for equality of the two status codes TestErrorCodes(status1, status2, args); diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index 96c140c1..af8a4d0e 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -66,8 +66,8 @@ class TestBlas: public Tester { static const std::vector kTransposes; // Data-type dependent, see .cc-file // Shorthand for the routine-specific functions passed to the tester - using Routine = std::function&, const Buffers&, CommandQueue&)>; - using ResultGet = std::function(const Arguments&, Buffers&, CommandQueue&)>; + using Routine = std::function&, const Buffers&, Queue&)>; + using ResultGet = std::function(const Arguments&, Buffers&, Queue&)>; using ResultIndex = std::function&, const size_t, const size_t)>; using ResultIterator = std::function&)>; diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index 378968ed..002cb1a6 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -28,9 +28,9 @@ Tester::Tester(int argc, char *argv[], const bool silent, const std::string &name, const std::vector &options): help_("Options given/available:\n"), platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))), - device_(Device(platform_, kDeviceType, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))), + device_(Device(platform_, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))), context_(Context(device_)), - queue_(CommandQueue(context_, device_)), + queue_(Queue(context_, device_)), full_test_(CheckArgument(argc, argv, help_, kArgFullTest)), error_log_{}, num_passed_{0}, @@ -339,11 +339,11 @@ template <> const std::vector GetExampleScalars(const bool full_test) { template <> bool PrecisionSupported(const Device &) { return true; } template <> bool PrecisionSupported(const Device &) { return true; } template <> bool PrecisionSupported(const Device &device) { - auto extensions = device.Extensions(); + auto extensions = device.Capabilities(); return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true; } template <> bool PrecisionSupported(const Device &device) { - auto extensions = device.Extensions(); + auto extensions = device.Capabilities(); return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true; } diff --git a/test/correctness/tester.h b/test/correctness/tester.h index 93515138..06f4afbe 100644 --- a/test/correctness/tester.h +++ b/test/correctness/tester.h @@ -36,9 +36,6 @@ template class Tester { public: - // Types of devices to consider - const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL; - // Maximum number of test results printed on a single line static constexpr auto kResultsPerLine = size_t{64}; @@ -92,7 +89,7 @@ class Tester { Platform platform_; Device device_; Context context_; - CommandQueue queue_; + Queue queue_; // Whether or not to run the full test-suite or just a smoke test bool full_test_; diff --git a/test/performance/client.cc b/test/performance/client.cc index 676e88e4..893bb55d 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -110,9 +110,9 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) // Initializes OpenCL and the libraries auto platform = Platform(args.platform_id); - auto device = Device(platform, kDeviceType, args.device_id); + auto device = Device(platform, args.device_id); auto context = Context(device); - auto queue = CommandQueue(context, device); + auto queue = Queue(context, device); if (args.compare_clblas) { clblasSetup(); } // Iterates over all "num_step" values jumping by "step" each time @@ -135,17 +135,17 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) PopulateVector(c_source); // Creates the matrices on the device - auto x_vec = Buffer(context, CL_MEM_READ_WRITE, args.x_size*sizeof(T)); - auto y_vec = Buffer(context, CL_MEM_READ_WRITE, args.y_size*sizeof(T)); - auto a_mat = Buffer(context, CL_MEM_READ_WRITE, args.a_size*sizeof(T)); - auto b_mat = Buffer(context, CL_MEM_READ_WRITE, args.b_size*sizeof(T)); - auto c_mat = Buffer(context, CL_MEM_READ_WRITE, args.c_size*sizeof(T)); - x_vec.WriteBuffer(queue, args.x_size*sizeof(T), x_source); - y_vec.WriteBuffer(queue, args.y_size*sizeof(T), y_source); - a_mat.WriteBuffer(queue, args.a_size*sizeof(T), a_source); - b_mat.WriteBuffer(queue, args.b_size*sizeof(T), b_source); - c_mat.WriteBuffer(queue, args.c_size*sizeof(T), c_source); - auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat}; + auto x_vec = Buffer(context, args.x_size); + auto y_vec = Buffer(context, args.y_size); + auto a_mat = Buffer(context, args.a_size); + auto b_mat = Buffer(context, args.b_size); + auto c_mat = Buffer(context, args.c_size); + x_vec.Write(queue, args.x_size, x_source); + y_vec.Write(queue, args.y_size, y_source); + a_mat.Write(queue, args.a_size, a_source); + b_mat.Write(queue, args.b_size, b_source); + c_mat.Write(queue, args.c_size, c_source); + auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat}; // Runs the routines and collects the timings auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast"); @@ -176,7 +176,7 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) // value found in the vector of timing results. The return value is in milliseconds. template double Client::TimedExecution(const size_t num_runs, const Arguments &args, - const Buffers &buffers, CommandQueue &queue, + const Buffers &buffers, Queue &queue, Routine run_blas, const std::string &library_name) { auto timings = std::vector(num_runs); for (auto &timing: timings) { diff --git a/test/performance/client.h b/test/performance/client.h index c9095967..9f6852d0 100644 --- a/test/performance/client.h +++ b/test/performance/client.h @@ -38,11 +38,8 @@ template class Client { public: - // Types of devices to consider - const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL; - // Shorthand for the routine-specific functions passed to the tester - using Routine = std::function&, const Buffers&, CommandQueue&)>; + using Routine = std::function&, const Buffers&, Queue&)>; using SetMetric = std::function&)>; using GetMetric = std::function&)>; @@ -63,8 +60,8 @@ class Client { private: // Runs a function a given number of times and returns the execution time of the shortest instance - double TimedExecution(const size_t num_runs, const Arguments &args, const Buffers &buffers, - CommandQueue &queue, Routine run_blas, const std::string &library_name); + double TimedExecution(const size_t num_runs, const Arguments &args, const Buffers &buffers, + Queue &queue, Routine run_blas, const std::string &library_name); // Prints the header of a performance-data table void PrintTableHeader(const bool silent, const std::vector &args); diff --git a/test/routines/level1/xaxpy.h b/test/routines/level1/xaxpy.h index 6ce5d7e2..866fb620 100644 --- a/test/routines/level1/xaxpy.h +++ b/test/routines/level1/xaxpy.h @@ -57,8 +57,7 @@ class TestXaxpy { static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Axpy(args.n, args.alpha, @@ -70,8 +69,7 @@ class TestXaxpy { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXaxpy(args.n, args.alpha, @@ -83,10 +81,9 @@ class TestXaxpy { } // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); - buffers.y_vec.ReadBuffer(queue, args.y_size*sizeof(T), result); + buffers.y_vec.Read(queue, args.y_size, result); return result; } diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.h index 73f7d76e..056dec30 100644 --- a/test/routines/level2/xgemv.h +++ b/test/routines/level2/xgemv.h @@ -68,8 +68,7 @@ class TestXgemv { static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gemv(args.layout, args.a_transpose, @@ -83,8 +82,7 @@ class TestXgemv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXgemv(static_cast(args.layout), @@ -99,10 +97,9 @@ class TestXgemv { } // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); - buffers.y_vec.ReadBuffer(queue, args.y_size*sizeof(T), result); + buffers.y_vec.Read(queue, args.y_size, result); return result; } diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h index 86a304d1..f06719d6 100644 --- a/test/routines/level3/xgemm.h +++ b/test/routines/level3/xgemm.h @@ -70,8 +70,7 @@ class TestXgemm { static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, @@ -85,8 +84,7 @@ class TestXgemm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXgemm(static_cast(args.layout), @@ -102,10 +100,9 @@ class TestXgemm { } // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); - buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + buffers.c_mat.Read(queue, args.c_size, result); return result; } diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.h index 75878b06..0c3b9c31 100644 --- a/test/routines/level3/xhemm.h +++ b/test/routines/level3/xhemm.h @@ -70,8 +70,7 @@ class TestXhemm { static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hemm(args.layout, args.side, args.triangle, @@ -85,8 +84,7 @@ class TestXhemm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhemm(static_cast(args.layout), @@ -102,10 +100,9 @@ class TestXhemm { } // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); - buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + buffers.c_mat.Read(queue, args.c_size, result); return result; } diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.h index f13e8a62..b20ec973 100644 --- a/test/routines/level3/xher2k.h +++ b/test/routines/level3/xher2k.h @@ -68,8 +68,7 @@ class TestXher2k { static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto alpha2 = T{args.alpha, args.alpha}; @@ -84,8 +83,7 @@ class TestXher2k { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto alpha2 = T{args.alpha, args.alpha}; @@ -102,10 +100,9 @@ class TestXher2k { } // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); - buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + buffers.c_mat.Read(queue, args.c_size, result); return result; } diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.h index 780b9b52..20c2b4b8 100644 --- a/test/routines/level3/xherk.h +++ b/test/routines/level3/xherk.h @@ -61,8 +61,7 @@ class TestXherk { static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Herk(args.layout, args.triangle, args.a_transpose, @@ -75,8 +74,7 @@ class TestXherk { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXherk(static_cast(args.layout), @@ -91,10 +89,9 @@ class TestXherk { } // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); - buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + buffers.c_mat.Read(queue, args.c_size, result); return result; } diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.h index 10476349..5b5ad351 100644 --- a/test/routines/level3/xsymm.h +++ b/test/routines/level3/xsymm.h @@ -70,8 +70,7 @@ class TestXsymm { static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Symm(args.layout, args.side, args.triangle, @@ -85,8 +84,7 @@ class TestXsymm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsymm(static_cast(args.layout), @@ -102,10 +100,9 @@ class TestXsymm { } // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); - buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + buffers.c_mat.Read(queue, args.c_size, result); return result; } diff --git a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.h index f3b1b542..21fcee2a 100644 --- a/test/routines/level3/xsyr2k.h +++ b/test/routines/level3/xsyr2k.h @@ -68,8 +68,7 @@ class TestXsyr2k { static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr2k(args.layout, args.triangle, args.a_transpose, @@ -83,8 +82,7 @@ class TestXsyr2k { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsyr2k(static_cast(args.layout), @@ -100,10 +98,9 @@ class TestXsyr2k { } // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); - buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + buffers.c_mat.Read(queue, args.c_size, result); return result; } diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.h index 2ec9fb65..c92693c2 100644 --- a/test/routines/level3/xsyrk.h +++ b/test/routines/level3/xsyrk.h @@ -61,8 +61,7 @@ class TestXsyrk { static size_t DefaultLDC(const Arguments &args) { return args.n; } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syrk(args.layout, args.triangle, args.a_transpose, @@ -75,8 +74,7 @@ class TestXsyrk { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsyrk(static_cast(args.layout), @@ -91,10 +89,9 @@ class TestXsyrk { } // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.c_size, static_cast(0)); - buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result); + buffers.c_mat.Read(queue, args.c_size, result); return result; } diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.h index 7b7e7af1..d5a52903 100644 --- a/test/routines/level3/xtrmm.h +++ b/test/routines/level3/xtrmm.h @@ -61,8 +61,7 @@ class TestXtrmm { static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, @@ -75,8 +74,7 @@ class TestXtrmm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, - CommandQueue &queue) { + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtrmm(static_cast(args.layout), @@ -93,10 +91,9 @@ class TestXtrmm { } // Describes how to download the results of the computation (more importantly: which buffer) - static std::vector DownloadResult(const Arguments &args, Buffers &buffers, - CommandQueue &queue) { + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.b_size, static_cast(0)); - buffers.b_mat.ReadBuffer(queue, args.b_size*sizeof(T), result); + buffers.b_mat.Read(queue, args.b_size, result); return result; } -- cgit v1.2.3