path: root/src/cupp11.hpp
diff options
Diffstat (limited to 'src/cupp11.hpp')
1 files changed, 782 insertions, 0 deletions
diff --git a/src/cupp11.hpp b/src/cupp11.hpp
new file mode 100644
index 00000000..ec21c5b1
--- /dev/null
+++ b/src/cupp11.hpp
@@ -0,0 +1,782 @@
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+// Author(s):
+// Cedric Nugteren <>
+// This file implements a bunch of C++11 classes that act as wrappers around CUDA objects and API
+// calls. The main benefits are increased abstraction, automatic memory management, and portability.
+// Portability here means that a similar header exists for OpenCL with the same classes and
+// interfaces. In other words, moving from the CUDA API to the OpenCL API becomes a one-line change.
+// This file is taken from the CLCudaAPI project <> and
+// therefore contains the following header copyright notice:
+// =================================================================================================
+// Copyright 2015 SURFsara
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =================================================================================================
+#ifndef CLBLAST_CUPP11_H_
+#define CLBLAST_CUPP11_H_
+// C++
+#include <algorithm> // std::copy
+#include <string> // std::string
+#include <vector> // std::vector
+#include <memory> // std::shared_ptr
+#include <cstring> // std::strlen
+// CUDA
+#define CUDA_NO_HALF // Incompatible with CLBlast's definition; TODO: resolve this
+#include <cuda.h> // CUDA driver API
+#include <nvrtc.h> // NVIDIA runtime compilation API
+// Exception classes
+#include "cxpp11_common.hpp"
+namespace clblast {
+// =================================================================================================
+// Max-length of strings
+constexpr auto kStringLength = 256;
+// =================================================================================================
+// Represents a runtime error returned by a CUDA driver API function
+class CLCudaAPIError : public ErrorCode<DeviceError, CUresult> {
+ explicit CLCudaAPIError(CUresult status, const std::string &where):
+ ErrorCode(status, where, "CUDA error: " + where + ": " +
+ GetErrorName(status) + " --> " + GetErrorString(status)) {
+ }
+ // Throws a CLCudaAPIError for every status other than CUDA_SUCCESS
+ static void Check(const CUresult status, const std::string &where) {
+   if (status == CUDA_SUCCESS) { return; }
+   throw CLCudaAPIError(status, where);
+ }
+ // Non-throwing variant for clean-up code: a failure is only reported on stderr
+ static void CheckDtor(const CUresult status, const std::string &where) {
+   if (status == CUDA_SUCCESS) { return; }
+   const CLCudaAPIError error(status, where);
+   fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", error.what());
+ }
+ // Returns the symbolic name of a driver status code. The look-up itself can fail for a code
+ // the driver does not recognise; in that case no valid string is available, so a placeholder
+ // containing the numeric code is returned instead of constructing a std::string from an
+ // invalid pointer.
+ std::string GetErrorName(CUresult status) const {
+   const char* status_code = nullptr;
+   const auto lookup = cuGetErrorName(status, &status_code);
+   if (lookup != CUDA_SUCCESS || status_code == nullptr) {
+     return "unknown error code " + std::to_string(static_cast<int>(status));
+   }
+   return std::string(status_code);
+ }
+ // Returns the human-readable description of a driver status code. As with the name look-up,
+ // the query can fail for unrecognised codes; a fixed placeholder is returned in that case so
+ // that no std::string is ever built from an invalid pointer.
+ std::string GetErrorString(CUresult status) const {
+   const char* status_string = nullptr;
+   const auto lookup = cuGetErrorString(status, &status_string);
+   if (lookup != CUDA_SUCCESS || status_string == nullptr) {
+     return std::string("no description available");
+   }
+   return std::string(status_string);
+ }
+// Represents a runtime error returned by a CUDA runtime compilation API function
+class CLCudaAPINVRTCError : public ErrorCode<DeviceError, nvrtcResult> {
+ explicit CLCudaAPINVRTCError(nvrtcResult status, const std::string &where):
+ ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) {
+ }
+ // Throws a CLCudaAPINVRTCError for every status other than NVRTC_SUCCESS
+ static void Check(const nvrtcResult status, const std::string &where) {
+   if (status == NVRTC_SUCCESS) { return; }
+   throw CLCudaAPINVRTCError(status, where);
+ }
+ // Non-throwing variant for clean-up code: a failure is only reported on stderr
+ static void CheckDtor(const nvrtcResult status, const std::string &where) {
+   if (status == NVRTC_SUCCESS) { return; }
+   const CLCudaAPINVRTCError error(status, where);
+   fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", error.what());
+ }
+ // Converts an NVRTC status code into the description supplied by the NVRTC library
+ std::string GetErrorString(nvrtcResult status) const {
+   return std::string(nvrtcGetErrorString(status));
+ }
+// Exception returned when building a program
+using CLCudaAPIBuildError = CLCudaAPINVRTCError;
+// =================================================================================================
+// Error occurred in CUDA driver or runtime compilation API
+#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call))
+#define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call))
+// Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors)
+#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call))
+#define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call))
+// =================================================================================================
+// C++11 version of two 'CUevent' pointers
+class Event {
+ // Note that there is no constructor based on the regular CUDA data-type because of extra state
+ // Regular constructor with memory management
+ // Creates the start/end event pair. Both handles are value-initialised (null) before creation
+ // so that, should either cuEventCreate call throw, the deleters of the already-constructed
+ // members skip cuEventDestroy instead of passing an indeterminate handle to the driver.
+ explicit Event():
+     start_(new CUevent(), [](CUevent* e) {
+       if (*e) { CheckErrorDtor(cuEventDestroy(*e)); }
+       delete e;
+     }),
+     end_(new CUevent(), [](CUevent* e) {
+       if (*e) { CheckErrorDtor(cuEventDestroy(*e)); }
+       delete e;
+     }) {
+   CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT));
+   CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT));
+ }
+ // Waits for completion of this event (not implemented for CUDA)
+ void WaitForCompletion() const {
+   // Intentionally empty: cuStreamSynchronize is called after each kernel launch
+ }
+ // Retrieves the elapsed time of the last recorded event
+ // Time between the recorded start and end events as reported by cuEventElapsedTime. The
+ // status of that call is not inspected; the variable is pre-set to zero.
+ float GetElapsedTime() const {
+   float result = 0.0f;
+   cuEventElapsedTime(&result, *start_, *end_);
+   return result;
+ }
+ // Accessors to the private data-members
+ // Raw handle of the event marking the beginning of the timed region
+ const CUevent& start() const {
+   return *start_;
+ }
+ // Raw handle of the event marking the end of the timed region
+ const CUevent& end() const {
+   return *end_;
+ }
+ // Address of this wrapper, usable as an 'EventPointer'
+ Event* pointer() {
+   return this;
+ }
+ std::shared_ptr<CUevent> start_;
+ std::shared_ptr<CUevent> end_;
+// Pointer to a CUDA event
+using EventPointer = Event*;
+// =================================================================================================
+// Raw platform ID type
+using RawPlatformID = size_t;
+// The CUDA platform: initializes the CUDA driver API
+class Platform {
+ // Initializes the platform. Note that the platform ID variable is not actually used for CUDA.
+ // Only platform 0 exists for CUDA; any other ID is rejected before the driver is initialised
+ explicit Platform(const size_t platform_id) : platform_id_(0) {
+   const auto is_supported_id = (platform_id == 0);
+   if (!is_supported_id) { throw LogicError("CUDA back-end requires a platform ID of 0"); }
+   CheckError(cuInit(0));
+ }
+ // Methods to retrieve platform information
+ // Fixed platform name
+ std::string Name() const {
+   return "CUDA";
+ }
+ // Fixed vendor name
+ std::string Vendor() const {
+   return "NVIDIA Corporation";
+ }
+ // Formats the integer reported by cuDriverGetVersion as "CUDA driver <number>"
+ std::string Version() const {
+   int result = 0;
+   CheckError(cuDriverGetVersion(&result));
+   return std::string{"CUDA driver "} + std::to_string(result);
+ }
+ // Returns the number of devices on this platform
+ // Queries the driver for the device count and widens it to size_t
+ size_t NumDevices() const {
+   int result = 0;
+   CheckError(cuDeviceGetCount(&result));
+   return static_cast<size_t>(result);
+ }
+ // Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0)
+ const RawPlatformID& operator()() const {
+   return platform_id_;  // set to 0 by the constructor
+ }
+ const size_t platform_id_;
+// Retrieves a vector with all platforms. Note that there is just one platform in CUDA.
+// The result always holds exactly one element: the platform with ID 0. The function body is
+// now properly terminated by its closing brace.
+inline std::vector<Platform> GetAllPlatforms() {
+  return std::vector<Platform>{ Platform(size_t{0}) };
+}
+// =================================================================================================
+// Raw device ID type
+using RawDeviceID = CUdevice;
+// C++11 version of 'CUdevice'
+class Device {
+ // Constructor based on the regular CUDA data-type
+ // Wraps an existing raw device handle as-is
+ explicit Device(const CUdevice device):
+     device_(device) {
+ }
+ // Initialization
+ // Looks up device number 'device_id'; throws a RuntimeError when the platform reports no
+ // devices or when the requested ID is not below the device count.
+ explicit Device(const Platform &platform, const size_t device_id) {
+   const auto device_count = platform.NumDevices();
+   if (device_count == 0) { throw RuntimeError("Device: no devices found"); }
+   if (!(device_id < device_count)) {
+     throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
+   }
+   CheckError(cuDeviceGet(&device_, device_id));
+ }
+ // Methods to retrieve device information
+ // The CUDA back-end knows a single platform, hence the constant
+ RawPlatformID PlatformID() const {
+   return 0;
+ }
+ // Formats the integer reported by cuDriverGetVersion as "CUDA driver <number>"
+ std::string Version() const {
+   int result = 0;
+   CheckError(cuDriverGetVersion(&result));
+   return std::string{"CUDA driver "} + std::to_string(result);
+ }
+ // Same query as Version(), but returns the bare number
+ size_t VersionNumber() const {
+   int result = 0;
+   CheckError(cuDriverGetVersion(&result));
+   return static_cast<size_t>(result);
+ }
+ // Fixed vendor name
+ std::string Vendor() const {
+   return "NVIDIA Corporation";
+ }
+ // Fetches the device name into a zero-filled buffer of kStringLength characters and then
+ // cuts the string at the first '\0'.
+ std::string Name() const {
+   auto result = std::string(kStringLength, '\0');
+   CheckError(cuDeviceGetName(&result[0], result.size(), device_));
+   const auto used_length = strlen(result.c_str());
+   result.resize(used_length);
+   return result;
+ }
+ // Fixed device type
+ std::string Type() const {
+   return "GPU";
+ }
+ // Maximum number of threads per block, exposed under its OpenCL-style name
+ size_t MaxWorkGroupSize() const {
+   return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
+ }
+ // Fixed number of dimensions
+ size_t MaxWorkItemDimensions() const {
+   return size_t{3};
+ }
+ // Maximum block size per dimension, in x/y/z order. The initialiser list is now complete:
+ // one entry per dimension, matching the value returned by MaxWorkItemDimensions().
+ std::vector<size_t> MaxWorkItemSizes() const {
+   return std::vector<size_t>{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X),
+                              GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
+                              GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)};
+ }
+ // Maximum shared memory per block, exposed under its OpenCL-style name
+ unsigned long LocalMemSize() const {
+   const auto shared_memory = GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
+   return static_cast<unsigned long>(shared_memory);
+ }
+ // Compute capability formatted as "SM<major>.<minor>". Both numbers are queried from the
+ // device here; the original body referenced them without declaring them.
+ std::string Capabilities() const {
+   const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
+   const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
+   return "SM"+std::to_string(major)+"."+std::to_string(minor);
+ }
+ // Architecture string formatted as "compute_<major><minor>"; Program::Build appends it to
+ // "-arch=". Both numbers are queried from the device here; the original body referenced
+ // them without declaring them.
+ std::string ComputeArch() const {
+   const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
+   const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
+   return "compute_"+std::to_string(major)+std::to_string(minor);
+ }
+ // No extensions are reported by this back-end; the argument is ignored
+ bool HasExtension(const std::string &extension) const {
+   return false;
+ }
+ // Double precision is always reported as available
+ bool SupportsFP64() const {
+   return true;
+ }
+ // Half precision is reported for compute capability 5.3 and for every major version above
+ // 5. Both numbers are queried from the device here; the original body referenced them
+ // without declaring them.
+ bool SupportsFP16() const {
+   const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
+   const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
+   if (major > 5) { return true; } // SM 6.x, 7.x and higher
+   if (major == 5 && minor == 3) { return true; } // SM 5.3
+   return false;
+ }
+ // Clock-rate attribute scaled by 1e-3 and truncated to an integer
+ size_t CoreClock() const {
+   const auto scaled_rate = 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE);
+   return static_cast<size_t>(scaled_rate);
+ }
+ // Number of multiprocessors, exposed under its OpenCL-style name
+ size_t ComputeUnits() const {
+   return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
+ }
+ // Total device memory as reported by cuDeviceTotalMem
+ unsigned long MemorySize() const {
+   size_t result = 0;
+   CheckError(cuDeviceTotalMem(&result, device_));
+   return static_cast<unsigned long>(result);
+ }
+ // Reported as equal to the total memory size
+ unsigned long MaxAllocSize() const {
+   return MemorySize();
+ }
+ // Memory clock-rate attribute scaled by 1e-3 and truncated to an integer
+ size_t MemoryClock() const {
+   const auto scaled_rate = 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE);
+   return static_cast<size_t>(scaled_rate);
+ }
+ // Width of the global memory bus as reported by the device attribute
+ size_t MemoryBusWidth() const {
+   return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH);
+ }
+ // Configuration-validity checks
+ // True when the requested amount does not exceed LocalMemSize()
+ bool IsLocalMemoryValid(const size_t local_mem_usage) const {
+   const auto available = LocalMemSize();
+   return (local_mem_usage <= available);
+ }
+ // Checks a block configuration against the device limits: the number of dimensions, the
+ // per-dimension maxima and the maximum total number of threads.
+ //
+ // The dimension count is verified first: MaxWorkItemSizes() has one entry per supported
+ // dimension, so indexing it with a larger 'local' would read out of bounds. The limits are
+ // also fetched once instead of once per loop iteration, as each fetch queries the driver.
+ bool IsThreadConfigValid(const std::vector<size_t> &local) const {
+   if (local.size() > MaxWorkItemDimensions()) { return false; }
+   const auto max_sizes = MaxWorkItemSizes();
+   auto local_size = size_t{1};
+   for (auto i=size_t{0}; i<local.size(); ++i) {
+     if (local[i] > max_sizes[i]) { return false; }
+     local_size *= local[i];
+   }
+   return (local_size <= MaxWorkGroupSize());
+ }
+ // Query for a specific type of device or brand
+ // Device-type queries: this back-end always reports a GPU
+ bool IsCPU() const {
+   return false;
+ }
+ bool IsGPU() const {
+   return true;
+ }
+ // Brand queries: this back-end always reports NVIDIA
+ bool IsAMD() const {
+   return false;
+ }
+ bool IsNVIDIA() const {
+   return true;
+ }
+ bool IsIntel() const {
+   return false;
+ }
+ bool IsARM() const {
+   return false;
+ }
+ // Platform specific extensions
+ // Always empty for this back-end
+ std::string AMDBoardName() const {
+   return "";
+ }
+ // Forwards to Capabilities()
+ std::string NVIDIAComputeCapability() const {
+   return Capabilities();
+ }
+ // Accessor to the private data-member
+ const RawDeviceID& operator()() const {
+   return device_;  // the wrapped raw device handle
+ }
+ CUdevice device_;
+ // Private helper function
+ // Reads one integer device attribute and widens it to size_t
+ size_t GetInfo(const CUdevice_attribute info) const {
+   int result = 0;
+   CheckError(cuDeviceGetAttribute(&result, info, device_));
+   return static_cast<size_t>(result);
+ }
+// =================================================================================================
+// Raw context type
+using RawContext = CUcontext;
+// C++11 version of 'CUcontext'
+class Context {
+ // Constructor based on the regular CUDA data-type: memory management is handled elsewhere
+ // Stores a copy of an existing handle. No custom deleter is installed, so the CUDA context
+ // itself is not destroyed by this wrapper.
+ explicit Context(const CUcontext context):
+     context_(new CUcontext(context)) {
+ }
+ // Regular constructor with memory management
+ // Creates a new context on 'device'; it is destroyed once the last copy of this wrapper is
+ // gone. The handle is value-initialised (null) so that the 'if (*c)' guard in the deleter
+ // reads a defined value even when cuCtxCreate fails and throws.
+ explicit Context(const Device &device):
+     context_(new CUcontext(), [](CUcontext* c) {
+       if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); }
+       delete c;
+     }) {
+   CheckError(cuCtxCreate(context_.get(), 0, device()));
+ }
+ // Accessor to the private data-member
+ // The wrapped raw context handle
+ const RawContext& operator()() const {
+   return *context_;
+ }
+ // Address of the wrapped handle
+ RawContext* pointer() const {
+   return context_.get();
+ }
+ std::shared_ptr<CUcontext> context_;
+// Pointer to a raw CUDA context
+using ContextPointer = CUcontext*;
+// =================================================================================================
+// C++11 version of 'nvrtcProgram'. Additionally holds the program's source code.
+class Program {
+ Program() = default;
+ // Note that there is no constructor based on the regular CUDA data-type because of extra state
+ // Source-based constructor with memory management
+ // Creates an NVRTC program from CUDA source text; a copy of the source is kept.
+ //
+ // The program handle is value-initialised (null) so that the 'if (*p)' guard in the deleter
+ // reads a defined value when nvrtcCreateProgram fails and throws. 'module_' starts as null
+ // so that GetModule() returns a defined value before Build() has run.
+ explicit Program(const Context &, std::string source):
+     program_(new nvrtcProgram(), [](nvrtcProgram* p) {
+       if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); }
+       delete p;
+     }),
+     module_(nullptr),
+     source_(std::move(source)),
+     from_binary_(false) {
+   const auto source_ptr = &source_[0];
+   CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_ptr, nullptr, 0, nullptr, nullptr));
+ }
+ // PTX-based constructor
+ // Wraps already-compiled PTX. No NVRTC program is created; the PTX text is kept in
+ // 'source_' and is what GetIR() returns. The device and context arguments are not used.
+ // 'module_' starts as null so that GetModule() returns a defined value before Build().
+ explicit Program(const Device &device, const Context &context, const std::string &binary):
+     program_(nullptr), // not used
+     module_(nullptr),
+     source_(binary),
+     from_binary_(true) {
+ }
+ // Compiles the device program and checks whether or not there are any warnings/errors
+ // Builds the program for 'device' and loads the resulting PTX as a CUDA module.
+ //
+ // "-arch=<ComputeArch>" is appended to 'options' (the caller's vector is modified). Source
+ // programs are compiled with NVRTC first; a failed compilation throws CLCudaAPINVRTCError.
+ // Programs constructed from PTX skip the compilation step, but their module is loaded as
+ // well: the Kernel constructor needs a valid module, and before this change GetModule()
+ // returned an unset handle for such programs.
+ void Build(const Device &device, std::vector<std::string> &options) {
+   options.push_back("-arch=" + device.ComputeArch());
+   if (!from_binary_) {
+     // NVRTC expects the options as an array of C-strings
+     auto raw_options = std::vector<const char*>();
+     raw_options.reserve(options.size());
+     for (const auto &option: options) {
+       raw_options.push_back(option.c_str());
+     }
+     const auto status = nvrtcCompileProgram(*program_, static_cast<int>(raw_options.size()),
+                                             raw_options.data());
+     CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram");
+   }
+   CheckError(cuModuleLoadDataEx(&module_, GetIR().data(), 0, nullptr, nullptr));
+ }
+ // Confirms whether a certain status code is an actual compilation error or warning
+ // Only NVRTC_ERROR_COMPILATION counts as a compilation problem
+ bool StatusIsCompilationWarningOrError(const nvrtcResult status) const {
+   const auto is_compilation_issue = (status == NVRTC_ERROR_COMPILATION);
+   return is_compilation_issue;
+ }
+ // Retrieves the warning/error message from the compiler (if any)
+ // Returns the NVRTC program log; programs created from PTX have none and yield an empty
+ // string. The log is read in two steps: query its size, then fetch the text.
+ std::string GetBuildInfo(const Device &) const {
+   if (from_binary_) { return std::string{}; }
+   size_t bytes = 0;
+   CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes));
+   auto result = std::string(bytes, '\0');
+   CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0]));
+   return result;
+ }
+ // Retrieves an intermediate representation of the compiled program (i.e. PTX)
+ // Returns the PTX: the stored text for programs created from PTX, otherwise the output of
+ // the NVRTC compilation (size query first, then the text itself).
+ std::string GetIR() const {
+   if (from_binary_) { return source_; } // holds the PTX
+   size_t bytes = 0;
+   CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes));
+   auto result = std::string(bytes, '\0');
+   CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0]));
+   return result;
+ }
+ // Accessor to the private data-members
+ // The module handle that Build() fills in
+ const CUmodule GetModule() const {
+   return module_;
+ }
+ // The wrapped NVRTC program handle
+ const nvrtcProgram& operator()() const {
+   return *program_;
+ }
+ std::shared_ptr<nvrtcProgram> program_;
+ CUmodule module_;
+ std::string source_;
+ bool from_binary_;
+// =================================================================================================
+// Raw command-queue type
+using RawCommandQueue = CUstream;
+// C++11 version of 'CUstream'
+class Queue {
+ // Note that there is no constructor based on the regular CUDA data-type because of extra state
+ // Regular constructor with memory management
+ // Creates a non-blocking stream and remembers the context and device it belongs to. The
+ // stream handle is value-initialised (null) so that the 'if (*s)' guard in the deleter
+ // reads a defined value when cuStreamCreate fails and throws.
+ explicit Queue(const Context &context, const Device &device):
+     queue_(new CUstream(), [](CUstream* s) {
+       if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); }
+       delete s;
+     }),
+     context_(context),
+     device_(device) {
+   CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING));
+ }
+ // Synchronizes the queue and optionally also an event
+ // Waits for the end-event first, then for the whole stream
+ void Finish(Event &event) const {
+   CheckError(cuEventSynchronize(event.end()));
+   this->Finish();
+ }
+ // Blocks until cuStreamSynchronize returns for this stream
+ void Finish() const {
+   CheckError(cuStreamSynchronize(*queue_));
+ }
+ // Retrieves the corresponding context or device
+ // Copy of the context passed to the constructor
+ Context GetContext() const {
+   return context_;
+ }
+ // Copy of the device passed to the constructor
+ Device GetDevice() const {
+   return device_;
+ }
+ // Accessor to the private data-member
+ const RawCommandQueue& operator()() const {
+   return *queue_;  // the wrapped raw stream handle
+ }
+ std::shared_ptr<CUstream> queue_;
+ const Context context_;
+ const Device device_;
+// =================================================================================================
+// C++11 version of page-locked host memory
+template <typename T>
+class BufferHost {
+ // Regular constructor with memory management
+ // Allocates 'size' elements of type T with cuMemAllocHost.
+ //
+ // The deleter reports failures through the non-throwing CheckErrorDtor: an exception leaving
+ // a shared_ptr deleter would terminate the program and would skip 'delete m'. The pointer
+ // slot starts as null and is only freed when set, so a failed allocation does not lead to
+ // cuMemFreeHost being called on an indeterminate pointer.
+ explicit BufferHost(const Context &, const size_t size):
+     buffer_(new void*(nullptr), [](void** m) {
+       if (*m) { CheckErrorDtor(cuMemFreeHost(*m)); }
+       delete m;
+     }),
+     size_(size) {
+   CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T)));
+ }
+ // Retrieves the actual allocated size in bytes
+ // Element count multiplied by the element size
+ size_t GetSize() const {
+   const auto bytes = size_*sizeof(T);
+   return bytes;
+ }
+ // Compatibility with std::vector
+ // Number of elements (not bytes)
+ size_t size() const {
+   return size_;
+ }
+ // Pointer to the first element
+ T* begin() {
+   return static_cast<T*>(*buffer_);
+ }
+ // One-past-the-last element, as std::vector::end() would give. The previous version pointed
+ // AT the last element, so a [begin(), end()) range dropped it, and for an empty buffer
+ // 'size_-1' wrapped around.
+ T* end() { return static_cast<T*>(*buffer_) + size_; }
+ // Unchecked element access
+ T& operator[](const size_t i) {
+   return data()[i];
+ }
+ // Typed view of the host allocation
+ T* data() {
+   return static_cast<T*>(*buffer_);
+ }
+ const T* data() const {
+   return static_cast<T*>(*buffer_);
+ }
+ std::shared_ptr<void*> buffer_;
+ const size_t size_;
+// =================================================================================================
+// Enumeration of buffer access types
+enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };
+// C++11 version of 'CUdeviceptr'
+template <typename T>
+class Buffer {
+ // Constructor based on the regular CUDA data-type: memory management is handled elsewhere
+ // Stores a copy of an existing device pointer and marks it as not owned. No custom deleter
+ // is installed, so the device memory is never freed by this wrapper.
+ explicit Buffer(const CUdeviceptr buffer):
+     buffer_(new CUdeviceptr(buffer)),
+     access_(BufferAccess::kNotOwned) {
+ }
+ // Regular constructor with memory management. If this class does not own the buffer object, then
+ // the memory will not be freed automatically afterwards.
+ // Allocates 'size' elements of type T on the device. The memory is freed with the last copy
+ // of this wrapper unless 'access' is kNotOwned.
+ //
+ // The deleter reports failures through the non-throwing CheckErrorDtor: an exception leaving
+ // a shared_ptr deleter would terminate the program and would skip 'delete m'. The pointer
+ // slot starts as zero and is only freed when set, so a failed cuMemAlloc does not lead to
+ // cuMemFree being called on an indeterminate value.
+ explicit Buffer(const Context &, const BufferAccess access, const size_t size):
+     buffer_(new CUdeviceptr(0), [access](CUdeviceptr* m) {
+       if (access != BufferAccess::kNotOwned && *m != 0) { CheckErrorDtor(cuMemFree(*m)); }
+       delete m;
+     }),
+     access_(access) {
+   CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T)));
+ }
+ // As above, but now with read/write access as a default
+ // Delegates to the constructor above with kReadWrite access
+ explicit Buffer(const Context &context, const size_t size):
+     Buffer<T>(context, BufferAccess::kReadWrite, size) { }
+ // Constructs a new buffer based on an existing host-container
+ // Allocates a read/write buffer of (end - start) elements, fills it from the host range
+ // beginning at 'start' and waits for the copy to finish. The range is read through a plain
+ // pointer to its first element.
+ template <typename Iterator>
+ explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
+     Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
+   const auto size = static_cast<size_t>(end - start);
+   const auto pointer = &*start;
+   CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue()));
+   queue.Finish();
+ }
+ // Copies from device to host: reading the device buffer a-synchronously
+ // Enqueues a copy of 'size' elements, starting 'offset' elements into the device buffer,
+ // to 'host'. Write-only buffers are rejected with a LogicError. Does not wait for the copy.
+ void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
+   const auto is_readable = (access_ != BufferAccess::kWriteOnly);
+   if (!is_readable) { throw LogicError("Buffer: reading from a write-only buffer"); }
+   CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue()));
+ }
+ // As the pointer overload, for a std::vector target. Throws a LogicError when the vector
+ // holds fewer than 'size' elements. The host pointer argument (host.data()) was missing
+ // from the forwarded call.
+ void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
+                const size_t offset = 0) const {
+   if (host.size() < size) {
+     throw LogicError("Buffer: target host buffer is too small");
+   }
+   ReadAsync(queue, size, host.data(), offset);
+ }
+ // As the pointer overload, for a BufferHost target. Throws a LogicError when the host
+ // buffer holds fewer than 'size' elements. The host pointer argument (host.data()) was
+ // missing from the forwarded call.
+ void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
+                const size_t offset = 0) const {
+   if (host.size() < size) {
+     throw LogicError("Buffer: target host buffer is too small");
+   }
+   ReadAsync(queue, size, host.data(), offset);
+ }
+ // Copies from device to host: reading the device buffer
+ // Blocking read: enqueue the copy, then wait for the queue to finish
+ void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
+   this->ReadAsync(queue, size, host, offset);
+   queue.Finish();
+ }
+ // Blocking read into a std::vector; forwards its storage to the pointer overload. The host
+ // pointer argument (host.data()) was missing from the forwarded call. Unlike ReadAsync, no
+ // size check is made on the vector here.
+ void Read(const Queue &queue, const size_t size, std::vector<T> &host,
+           const size_t offset = 0) const {
+   Read(queue, size, host.data(), offset);
+ }
+ // Blocking read into a BufferHost; forwards its storage to the pointer overload. The host
+ // pointer argument (host.data()) was missing from the forwarded call. Unlike ReadAsync, no
+ // size check is made on the host buffer here.
+ void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
+           const size_t offset = 0) const {
+   Read(queue, size, host.data(), offset);
+ }
+ // Copies from host to device: writing the device buffer a-synchronously
+ // Enqueues a copy of 'size' elements from 'host' to the device buffer, starting 'offset'
+ // elements in. Throws a LogicError for read-only buffers and when offset+size elements do
+ // not fit in the allocation. Does not wait for the copy.
+ void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
+   const auto is_writable = (access_ != BufferAccess::kReadOnly);
+   if (!is_writable) { throw LogicError("Buffer: writing to a read-only buffer"); }
+   const auto required_bytes = (offset+size)*sizeof(T);
+   if (GetSize() < required_bytes) {
+     throw LogicError("Buffer: target device buffer is too small");
+   }
+   CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue()));
+ }
+ // As the pointer overload, for a std::vector source. The host pointer argument
+ // (host.data()) was missing from the forwarded call.
+ void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
+                 const size_t offset = 0) {
+   WriteAsync(queue, size, host.data(), offset);
+ }
+ // As the pointer overload, for a BufferHost source. The host pointer argument
+ // (host.data()) was missing from the forwarded call.
+ void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
+                 const size_t offset = 0) {
+   WriteAsync(queue, size, host.data(), offset);
+ }
+ // Copies from host to device: writing the device buffer
+ // Blocking write: enqueue the copy, then wait for the queue to finish
+ void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
+   this->WriteAsync(queue, size, host, offset);
+   queue.Finish();
+ }
+ // Blocking write from a std::vector; forwards its storage to the pointer overload. The
+ // host pointer argument (host.data()) was missing from the forwarded call.
+ void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
+            const size_t offset = 0) {
+   Write(queue, size, host.data(), offset);
+ }
+ // Blocking write from a BufferHost; forwards its storage to the pointer overload. The
+ // host pointer argument (host.data()) was missing from the forwarded call.
+ void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
+            const size_t offset = 0) {
+   Write(queue, size, host.data(), offset);
+ }
+ // Copies the contents of this buffer into another device buffer
+ // Enqueues a device-to-device copy of the first 'size' elements into 'destination'. Does
+ // not wait for the copy.
+ void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
+   CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue()));
+ }
+ // Blocking variant: enqueue the copy, then wait for the queue to finish
+ void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
+   this->CopyToAsync(queue, size, destination);
+   queue.Finish();
+ }
+ // Retrieves the actual allocated size in bytes
+ // Asks the driver (cuMemGetAddressRange) for the size of the allocation behind the pointer
+ size_t GetSize() const {
+   size_t result = 0;
+   CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_));
+   return result;
+ }
+ // Accessors to the private data-members
+ // Const access yields a copy of the device pointer
+ CUdeviceptr operator()() const {
+   return *buffer_;
+ }
+ // Non-const access yields a reference to the stored device pointer
+ CUdeviceptr& operator()() {
+   return *buffer_;
+ }
+ std::shared_ptr<CUdeviceptr> buffer_;
+ const BufferAccess access_;
+// =================================================================================================
+// C++11 version of 'CUfunction'
+class Kernel {
+ // Constructor based on the regular CUDA data-type: memory management is handled elsewhere
+ // Wraps an existing function handle; its name is not known here and is set to "unknown"
+ explicit Kernel(const CUfunction kernel):
+     name_("unknown"), kernel_(kernel) { }
+ // Regular constructor with memory management
+ // Looks up the function called 'name' in the program's module
+ explicit Kernel(const Program &program, const std::string &name):
+     name_(name) {
+   CheckError(cuModuleGetFunction(&kernel_, program.GetModule(), name.c_str()));
+ }
+ // Sets a kernel argument at the indicated position. This stores both the value of the argument
+ // (as raw bytes) and the index indicating where this value can be found.
+ // Appends the raw bytes of 'value' to the argument storage and records, for position
+ // 'index', where those bytes start. Setting the same index again appends a fresh copy; the
+ // earlier bytes stay in the storage but are no longer referenced.
+ template <typename T>
+ void SetArgument(const size_t index, const T &value) {
+   if (arguments_indices_.size() <= index) { arguments_indices_.resize(index+1); }
+   arguments_indices_[index] = arguments_data_.size();
+   const auto bytes = reinterpret_cast<const char*>(&value);
+   arguments_data_.insert(arguments_data_.end(), bytes, bytes + sizeof(T));
+ }
+ // Buffer arguments are passed as their raw device pointer
+ template <typename T>
+ void SetArgument(const size_t index, Buffer<T> &value) {
+   auto &device_pointer = value();
+   SetArgument(index, device_pointer);
+ }
+ // Sets all arguments in one go using parameter packs. Note that this resets all previously set
+ // arguments using 'SetArgument' or 'SetArguments'.
+ // Drops everything stored so far, then stores 'args' at positions 0, 1, 2, ...
+ template <typename... Args>
+ void SetArguments(Args&... args) {
+   arguments_data_.clear();
+   arguments_indices_.clear();
+   SetArgumentsRecursive(0, args...);
+ }
+ // Retrieves the amount of local memory used per work-group for this kernel. Note that this is
+ // called shared memory in CUDA terminology.
+ // Reads the CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES attribute; the device argument is unused
+ unsigned long LocalMemUsage(const Device &) const {
+   int result = 0;
+   CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_));
+   return static_cast<unsigned long>(result);
+ }
+ // Retrieves the name of the kernel
+ // The name given at construction ("unknown" for kernels wrapped from a raw handle)
+ std::string GetFunctionName() const { return name_; }
+ // Launches a kernel onto the specified queue
+ // Launches the kernel on 'queue' and waits for it to finish.
+ //
+ // 'global' holds the total number of threads per dimension and 'local' the threads per
+ // block; the grid size is global[i]/local[i]. When 'event' is non-null, its start and end
+ // events are recorded around the launch.
+ //
+ // Throws a LogicError when 'local' is empty, when 'global' and 'local' differ in length,
+ // when more than three dimensions are given, or when a block dimension is zero. A
+ // CLCudaAPIError is thrown when a driver call fails, which now includes the
+ // synchronisation that follows the launch.
+ void Launch(const Queue &queue, const std::vector<size_t> &global,
+             const std::vector<size_t> &local, EventPointer event) {
+   // TODO: Currently this CUDA launch is always synchronous due to a cuStreamSynchronize call
+   if (local.size() == 0) {
+     throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end");
+   }
+   if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); }
+
+   // 'grid' and 'block' below have three entries, so more dimensions cannot be represented
+   if (local.size() > 3) {
+     throw LogicError("Kernel: more than three thread dimensions are not supported");
+   }
+
+   // Creates the grid (number of threadblocks) and sets the block sizes (threads per block)
+   auto grid = std::vector<size_t>{1, 1, 1};
+   auto block = std::vector<size_t>{1, 1, 1};
+   for (auto i=size_t{0}; i<local.size(); ++i) {
+     if (local[i] == 0) { throw LogicError("Kernel: workgroup sizes must be non-zero"); }
+     grid[i] = global[i]/local[i];
+     block[i] = local[i];
+   }
+
+   // Creates the array of pointers from the arrays of indices & data
+   std::vector<void*> pointers;
+   pointers.reserve(arguments_indices_.size());
+   for (auto &index: arguments_indices_) {
+     pointers.push_back(&arguments_data_[index]);
+   }
+
+   // Launches the kernel, its execution time is recorded by events
+   if (event) { CheckError(cuEventRecord(event->start(), queue())); }
+   CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2],
+                             0, queue(), pointers.data(), nullptr));
+   CheckError(cuStreamSynchronize(queue()));
+   if (event) { CheckError(cuEventRecord(event->end(), queue())); }
+ }
+ // As above, but with an event waiting list
+ // Calls WaitForCompletion() on every event in the list and then forwards to the overload
+ // without a waiting list.
+ void Launch(const Queue &queue, const std::vector<size_t> &global,
+             const std::vector<size_t> &local, EventPointer event,
+             const std::vector<Event>& waitForEvents) {
+   for (const auto &pending : waitForEvents) {
+     pending.WaitForCompletion(); // note: doesn't do anything, every kernel call is synchronous
+   }
+   Launch(queue, global, local, event);
+ }
+ // Accessors to the private data-members
+ // The wrapped raw function handle (by reference for const access, by value otherwise)
+ const CUfunction& operator()() const {
+   return kernel_;
+ }
+ CUfunction operator()() {
+   return kernel_;
+ }
+ const std::string name_;
+ CUfunction kernel_;
+ std::vector<size_t> arguments_indices_; // Indices of the arguments
+ std::vector<char> arguments_data_; // The arguments data as raw bytes
+ // Internal implementation for the recursive SetArguments function.
+ // End of the recursion: a single remaining argument
+ template <typename T>
+ void SetArgumentsRecursive(const size_t index, T &first) { SetArgument(index, first); }
+ // Stores the first argument at 'index', then handles the remaining ones starting at index+1
+ template <typename T, typename... Args>
+ void SetArgumentsRecursive(const size_t index, T &first, Args&... args) {
+   SetArgument(index, first);
+   const auto next_index = index+1;
+   SetArgumentsRecursive(next_index, args...);
+ }
+// =================================================================================================
+} // namespace clblast