diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-11 23:16:57 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-11 23:16:57 +0200 |
commit | b901809345848b44442c787380b13db5e5156df0 (patch) | |
tree | 0f14f5e38c08b604a96304abda427fe6ce3f64d6 /src/cupp11.hpp | |
parent | 9224da19ef384c1a7986587a682035905f63cf55 (diff) |
Added first (untested) version of a CUDA API
Diffstat (limited to 'src/cupp11.hpp')
-rw-r--r-- | src/cupp11.hpp | 770 |
1 file changed, 770 insertions, 0 deletions
diff --git a/src/cupp11.hpp b/src/cupp11.hpp new file mode 100644 index 00000000..988366ea --- /dev/null +++ b/src/cupp11.hpp @@ -0,0 +1,770 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API +// calls. The main benefits are increased abstraction, automatic memory management, and portability. +// Portability here means that a similar header exists for CUDA with the same classes and +// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. +// +// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and +// therefore contains the following header copyright notice: +// +// ================================================================================================= +// +// Copyright 2015 SURFsara +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// ================================================================================================= + +#ifndef CLBLAST_CUPP11_H_ +#define CLBLAST_CUPP11_H_ + +// C++ +#include <algorithm> // std::copy +#include <string> // std::string +#include <vector> // std::vector +#include <memory> // std::shared_ptr + +// CUDA +#include <cuda.h> // CUDA driver API +#include <nvrtc.h> // NVIDIA runtime compilation API + +// Exception classes +#include "cxpp11_common.hpp" + +namespace clblast { +// ================================================================================================= + +// Max-length of strings +constexpr auto kStringLength = 256; + +// ================================================================================================= + +// Represents a runtime error returned by a CUDA driver API function +class CLCudaAPIError : public ErrorCode<DeviceError, CUresult> { +public: + explicit CLCudaAPIError(CUresult status, const std::string &where): + ErrorCode(status, where, "CUDA error: " + where + ": " + + GetErrorName(status) + " --> " + GetErrorString(status)) { + } + + static void Check(const CUresult status, const std::string &where) { + if (status != CUDA_SUCCESS) { + throw CLCudaAPIError(status, where); + } + } + + static void CheckDtor(const CUresult status, const std::string &where) { + if (status != CUDA_SUCCESS) { + fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what()); + } + } + +private: + std::string GetErrorName(CUresult status) const { + const char* status_code; + cuGetErrorName(status, &status_code); + return std::string(status_code); + } + std::string GetErrorString(CUresult status) const { + const char* status_string; + cuGetErrorString(status, &status_string); + return std::string(status_string); + } +}; + +// Represents a runtime error returned by a CUDA runtime compilation API function +class CLCudaAPINVRTCError : public ErrorCode<DeviceError, nvrtcResult> { +public: + explicit 
CLCudaAPINVRTCError(nvrtcResult status, const std::string &where): + ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) { + } + + static void Check(const nvrtcResult status, const std::string &where) { + if (status != NVRTC_SUCCESS) { + throw CLCudaAPINVRTCError(status, where); + } + } + + static void CheckDtor(const nvrtcResult status, const std::string &where) { + if (status != NVRTC_SUCCESS) { + fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPINVRTCError(status, where).what()); + } + } + +private: + std::string GetErrorString(nvrtcResult status) const { + const char* status_string = nvrtcGetErrorString(status); + return std::string(status_string); + } +}; + +// Exception returned when building a program +using CLCudaAPIBuildError = CLCudaAPINVRTCError; + +// ================================================================================================= + +// Error occurred in CUDA driver or runtime compilation API +#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call)) +#define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call)) + +// Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors) +#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call)) +#define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call)) + +// ================================================================================================= + +// C++11 version of two 'CUevent' pointers +class Event { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Regular constructor with memory management + explicit Event(): + start_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }), + end_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); 
delete e; }) { + CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT)); + CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT)); + } + + // Waits for completion of this event (not implemented for CUDA) + void WaitForCompletion() const { } + + // Retrieves the elapsed time of the last recorded event + float GetElapsedTime() const { + auto result = 0.0f; + cuEventElapsedTime(&result, *start_, *end_); + return result; + } + + // Accessors to the private data-members + const CUevent& start() const { return *start_; } + const CUevent& end() const { return *end_; } + Event* pointer() { return this; } +private: + std::shared_ptr<CUevent> start_; + std::shared_ptr<CUevent> end_; +}; + +// Pointer to a CUDA event +using EventPointer = Event*; + +// ================================================================================================= + +// Raw platform ID type +using RawPlatformID = size_t; + +// The CUDA platform: initializes the CUDA driver API +class Platform { +public: + + // Initializes the platform. Note that the platform ID variable is not actually used for CUDA. 
+ explicit Platform(const size_t platform_id) : platform_id_(0) { + if (platform_id != 0) { throw LogicError("CUDA back-end requires a platform ID of 0"); } + CheckError(cuInit(0)); + } + + // Methods to retrieve platform information + std::string Name() const { return "CUDA"; } + std::string Vendor() const { return "NVIDIA Corporation"; } + std::string Version() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return "CUDA driver "+std::to_string(result); + } + + // Returns the number of devices on this platform + size_t NumDevices() const { + auto result = 0; + CheckError(cuDeviceGetCount(&result)); + return static_cast<size_t>(result); + } + + // Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0) + const RawPlatformID& operator()() const { return platform_id_; } +private: + const size_t platform_id_; +}; + +// Retrieves a vector with all platforms. Note that there is just one platform in CUDA. +inline std::vector<Platform> GetAllPlatforms() { + auto all_platforms = std::vector<Platform>{ Platform(size_t{0}) }; + return all_platforms; +} + +// ================================================================================================= + +// Raw device ID type +using RawDeviceID = CUdevice; + +// C++11 version of 'CUdevice' +class Device { +public: + + // Constructor based on the regular CUDA data-type + explicit Device(const CUdevice device): device_(device) { } + + // Initialization + explicit Device(const Platform &platform, const size_t device_id) { + auto num_devices = platform.NumDevices(); + if (num_devices == 0) { + throw RuntimeError("Device: no devices found"); + } + if (device_id >= num_devices) { + throw RuntimeError("Device: invalid device ID "+std::to_string(device_id)); + } + + CheckError(cuDeviceGet(&device_, device_id)); + } + + // Methods to retrieve device information + RawPlatformID PlatformID() const { return 0; } + std::string Version() const { + auto result = 0; + 
CheckError(cuDriverGetVersion(&result)); + return "CUDA driver "+std::to_string(result); + } + size_t VersionNumber() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return static_cast<size_t>(result); + } + std::string Vendor() const { return "NVIDIA Corporation"; } + std::string Name() const { + auto result = std::string{}; + result.resize(kStringLength); + CheckError(cuDeviceGetName(&result[0], result.size(), device_)); + return result; + } + std::string Type() const { return "GPU"; } + size_t MaxWorkGroupSize() const {return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); } + size_t MaxWorkItemDimensions() const { return size_t{3}; } + std::vector<size_t> MaxWorkItemSizes() const { + return std::vector<size_t>{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X), + GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y), + GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)}; + } + unsigned long LocalMemSize() const { + return static_cast<unsigned long>(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)); + } + + std::string Capabilities() const { + const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + return "SM"+std::to_string(major)+"."+std::to_string(minor); + } + bool HasExtension(const std::string &extension) const { return false; } + bool SupportsFP64() const { return true; } + bool SupportsFP16() const { + const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + if (major > 5) { return true; } // SM 6.x, 7.x and higher + if (major == 5 && minor == 3) { return true; } // SM 5.3 + return false; + } + + size_t CoreClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE); } + size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); } + unsigned long MemorySize() const { + auto result = size_t{0}; + 
CheckError(cuDeviceTotalMem(&result, device_)); + return static_cast<unsigned long>(result); + } + unsigned long MaxAllocSize() const { return MemorySize(); } + size_t MemoryClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE); } + size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); } + + // Configuration-validity checks + bool IsLocalMemoryValid(const size_t local_mem_usage) const { + return (local_mem_usage <= LocalMemSize()); + } + bool IsThreadConfigValid(const std::vector<size_t> &local) const { + auto local_size = size_t{1}; + for (const auto &item: local) { local_size *= item; } + for (auto i=size_t{0}; i<local.size(); ++i) { + if (local[i] > MaxWorkItemSizes()[i]) { return false; } + } + if (local_size > MaxWorkGroupSize()) { return false; } + if (local.size() > MaxWorkItemDimensions()) { return false; } + return true; + } + + // Query for a specific type of device or brand + bool IsCPU() const { return false; } + bool IsGPU() const { return true; } + bool IsAMD() const { return false; } + bool IsNVIDIA() const { return true; } + bool IsIntel() const { return false; } + bool IsARM() const { return false; } + + // Platform specific extensions + std::string AMDBoardName() const { return ""; } + std::string NVIDIAComputeCapability() const { return Capabilities(); } + + // Accessor to the private data-member + const RawDeviceID& operator()() const { return device_; } +private: + CUdevice device_; + + // Private helper function + size_t GetInfo(const CUdevice_attribute info) const { + auto result = 0; + CheckError(cuDeviceGetAttribute(&result, info, device_)); + return static_cast<size_t>(result); + } +}; + +// ================================================================================================= + +// Raw context type +using RawContext = CUcontext; + +// C++11 version of 'CUcontext' +class Context { +public: + + // Constructor based on the regular CUDA data-type: memory management is 
handled elsewhere + explicit Context(const CUcontext context): + context_(new CUcontext) { + *context_ = context; + } + + // Regular constructor with memory management + explicit Context(const Device &device): + context_(new CUcontext, [](CUcontext* c) { + if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); } + delete c; + }) { + CheckError(cuCtxCreate(context_.get(), 0, device())); + } + + // Accessor to the private data-member + const RawContext& operator()() const { return *context_; } + RawContext* pointer() const { return &(*context_); } +private: + std::shared_ptr<CUcontext> context_; +}; + +// Pointer to a raw CUDA context +using ContextPointer = CUcontext*; + +// ================================================================================================= + +// C++11 version of 'nvrtcProgram'. Additionally holds the program's source code. +class Program { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Source-based constructor with memory management + explicit Program(const Context &, std::string source): + program_(new nvrtcProgram, [](nvrtcProgram* p) { + if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); } + delete p; + }), + source_(std::move(source)), + from_binary_(false) { + const auto source_ptr = &source_[0]; + CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_ptr, nullptr, 0, nullptr, nullptr)); + } + + // PTX-based constructor + explicit Program(const Device &device, const Context &context, const std::string &binary): + program_(nullptr), // not used + source_(binary), + from_binary_(true) { + } + + // Compiles the device program and checks whether or not there are any warnings/errors + void Build(const Device &, std::vector<std::string> &options) { + if (from_binary_) { return; } + auto raw_options = std::vector<const char*>(); + for (const auto &option: options) { + raw_options.push_back(option.c_str()); + } + auto status = nvrtcCompileProgram(*program_, 
raw_options.size(), raw_options.data()); + CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram"); + } + + // Confirms whether a certain status code is an actual compilation error or warning + bool StatusIsCompilationWarningOrError(const nvrtcResult status) const { + return (status == NVRTC_ERROR_INVALID_INPUT); + } + + // Retrieves the warning/error message from the compiler (if any) + std::string GetBuildInfo(const Device &) const { + if (from_binary_) { return std::string{}; } + auto bytes = size_t{0}; + CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0])); + return result; + } + + // Retrieves an intermediate representation of the compiled program (i.e. PTX) + std::string GetIR() const { + if (from_binary_) { return source_; } // holds the PTX + auto bytes = size_t{0}; + CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0])); + return result; + } + + // Accessor to the private data-member + const nvrtcProgram& operator()() const { return *program_; } +private: + std::shared_ptr<nvrtcProgram> program_; + const std::string source_; + const bool from_binary_; +}; + +// ================================================================================================= + +// Raw command-queue type +using RawCommandQueue = CUstream; + +// C++11 version of 'CUstream' +class Queue { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Regular constructor with memory management + explicit Queue(const Context &context, const Device &device): + queue_(new CUstream, [](CUstream* s) { + if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); } + delete s; + }), + context_(context), + device_(device) { + CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING)); + } + + // Synchronizes the 
queue and optionally also an event + void Finish(Event &event) const { + CheckError(cuEventSynchronize(event.end())); + Finish(); + } + void Finish() const { + CheckError(cuStreamSynchronize(*queue_)); + } + + // Retrieves the corresponding context or device + Context GetContext() const { return context_; } + Device GetDevice() const { return device_; } + + // Accessor to the private data-member + const RawCommandQueue& operator()() const { return *queue_; } +private: + std::shared_ptr<CUstream> queue_; + const Context context_; + const Device device_; +}; + +// ================================================================================================= + +// C++11 version of page-locked host memory +template <typename T> +class BufferHost { +public: + + // Regular constructor with memory management + explicit BufferHost(const Context &, const size_t size): + buffer_(new void*, [](void** m) { CheckError(cuMemFreeHost(*m)); delete m; }), + size_(size) { + CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T))); + } + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + return size_*sizeof(T); + } + + // Compatibility with std::vector + size_t size() const { return size_; } + T* begin() { return &static_cast<T*>(*buffer_)[0]; } + T* end() { return &static_cast<T*>(*buffer_)[size_-1]; } + T& operator[](const size_t i) { return static_cast<T*>(*buffer_)[i]; } + T* data() { return static_cast<T*>(*buffer_); } + const T* data() const { return static_cast<T*>(*buffer_); } + +private: + std::shared_ptr<void*> buffer_; + const size_t size_; +}; + +// ================================================================================================= + +// Enumeration of buffer access types +enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }; + +// C++11 version of 'CUdeviceptr' +template <typename T> +class Buffer { +public: + + // Constructor based on the regular CUDA data-type: memory management is handled elsewhere + 
explicit Buffer(const CUdeviceptr buffer): + buffer_(new CUdeviceptr), + access_(BufferAccess::kNotOwned) { + *buffer_ = buffer; + } + + // Regular constructor with memory management. If this class does not own the buffer object, then + // the memory will not be freed automatically afterwards. + explicit Buffer(const Context &, const BufferAccess access, const size_t size): + buffer_(new CUdeviceptr, [access](CUdeviceptr* m) { + if (access != BufferAccess::kNotOwned) { CheckError(cuMemFree(*m)); } + delete m; + }), + access_(access) { + CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T))); + } + + // As above, but now with read/write access as a default + explicit Buffer(const Context &context, const size_t size): + Buffer<T>(context, BufferAccess::kReadWrite, size) { + } + + // Constructs a new buffer based on an existing host-container + template <typename Iterator> + explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end): + Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) { + auto size = static_cast<size_t>(end - start); + auto pointer = &*start; + CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue())); + queue.Finish(); + } + + // Copies from device to host: reading the device buffer a-synchronously + void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { + if (access_ == BufferAccess::kWriteOnly) { + throw LogicError("Buffer: reading from a write-only buffer"); + } + CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue())); + } + void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host, + const size_t offset = 0) const { + if (host.size() < size) { + throw LogicError("Buffer: target host buffer is too small"); + } + ReadAsync(queue, size, host.data(), offset); + } + void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host, + const size_t offset = 0) const { + if 
(host.size() < size) { + throw LogicError("Buffer: target host buffer is too small"); + } + ReadAsync(queue, size, host.data(), offset); + } + + // Copies from device to host: reading the device buffer + void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { + ReadAsync(queue, size, host, offset); + queue.Finish(); + } + void Read(const Queue &queue, const size_t size, std::vector<T> &host, + const size_t offset = 0) const { + Read(queue, size, host.data(), offset); + } + void Read(const Queue &queue, const size_t size, BufferHost<T> &host, + const size_t offset = 0) const { + Read(queue, size, host.data(), offset); + } + + // Copies from host to device: writing the device buffer a-synchronously + void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { + if (access_ == BufferAccess::kReadOnly) { + throw LogicError("Buffer: writing to a read-only buffer"); + } + if (GetSize() < (offset+size)*sizeof(T)) { + throw LogicError("Buffer: target device buffer is too small"); + } + CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue())); + } + void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host, + const size_t offset = 0) { + WriteAsync(queue, size, host.data(), offset); + } + void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host, + const size_t offset = 0) { + WriteAsync(queue, size, host.data(), offset); + } + + // Copies from host to device: writing the device buffer + void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { + WriteAsync(queue, size, host, offset); + queue.Finish(); + } + void Write(const Queue &queue, const size_t size, const std::vector<T> &host, + const size_t offset = 0) { + Write(queue, size, host.data(), offset); + } + void Write(const Queue &queue, const size_t size, const BufferHost<T> &host, + const size_t offset = 0) { + Write(queue, size, 
host.data(), offset); + } + + // Copies the contents of this buffer into another device buffer + void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const { + CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue())); + } + void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const { + CopyToAsync(queue, size, destination); + queue.Finish(); + } + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + auto result = size_t{0}; + CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_)); + return result; + } + + // Accessors to the private data-members + CUdeviceptr operator()() const { return *buffer_; } + CUdeviceptr& operator()() { return *buffer_; } +private: + std::shared_ptr<CUdeviceptr> buffer_; + const BufferAccess access_; +}; + +// ================================================================================================= + +// C++11 version of 'CUfunction' +class Kernel { +public: + + // Constructor based on the regular CUDA data-type: memory management is handled elsewhere + explicit Kernel(const CUmodule module, const CUfunction kernel): + module_(module), + kernel_(kernel) { + } + + // Regular constructor with memory management + explicit Kernel(const Program &program, const std::string &name) { + CheckError(cuModuleLoadDataEx(&module_, program.GetIR().data(), 0, nullptr, nullptr)); + CheckError(cuModuleGetFunction(&kernel_, module_, name.c_str())); + } + + // Sets a kernel argument at the indicated position. This stores both the value of the argument + // (as raw bytes) and the index indicating where this value can be found. 
+ template <typename T> + void SetArgument(const size_t index, const T &value) { + if (index >= arguments_indices_.size()) { arguments_indices_.resize(index+1); } + arguments_indices_[index] = arguments_data_.size(); + for (auto j=size_t(0); j<sizeof(T); ++j) { + arguments_data_.push_back(reinterpret_cast<const char*>(&value)[j]); + } + } + template <typename T> + void SetArgument(const size_t index, Buffer<T> &value) { + SetArgument(index, value()); + } + + // Sets all arguments in one go using parameter packs. Note that this resets all previously set + // arguments using 'SetArgument' or 'SetArguments'. + template <typename... Args> + void SetArguments(Args&... args) { + arguments_indices_.clear(); + arguments_data_.clear(); + SetArgumentsRecursive(0, args...); + } + + // Retrieves the amount of local memory used per work-group for this kernel. Note that this the + // shared memory in CUDA terminology. + unsigned long LocalMemUsage(const Device &) const { + auto result = 0; + CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_)); + return static_cast<unsigned long>(result); + } + + // Retrieves the name of the kernel + std::string GetFunctionName() const { + return std::string{"unknown"}; // Not implemented for the CUDA backend + } + + // Launches a kernel onto the specified queue + void Launch(const Queue &queue, const std::vector<size_t> &global, + const std::vector<size_t> &local, EventPointer event) { + + // Creates the grid (number of threadblocks) and sets the block sizes (threads per block) + auto grid = std::vector<size_t>{1, 1, 1}; + auto block = std::vector<size_t>{1, 1, 1}; + if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); } + for (auto i=size_t{0}; i<local.size(); ++i) { grid[i] = global[i]/local[i]; } + for (auto i=size_t{0}; i<local.size(); ++i) { block[i] = local[i]; } + + // Creates the array of pointers from the arrays of indices & data + std::vector<void*> pointers; + 
for (auto &index: arguments_indices_) { + pointers.push_back(&arguments_data_[index]); + } + + // Launches the kernel, its execution time is recorded by events + CheckError(cuEventRecord(event->start(), queue())); + CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2], + 0, queue(), pointers.data(), nullptr)); + CheckError(cuEventRecord(event->end(), queue())); + } + + // As above, but with an event waiting list + // TODO: Implement this function + void Launch(const Queue &queue, const std::vector<size_t> &global, + const std::vector<size_t> &local, EventPointer event, + std::vector<Event>& waitForEvents) { + if (local.size() == 0) { + throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end"); + } + else if (waitForEvents.size() != 0) { + throw LogicError("Kernel: launching with an event waiting list is not implemented for the CUDA back-end"); + } + else { + return Launch(queue, global, local, event); + } + } + + // Accessors to the private data-members + const CUfunction& operator()() const { return kernel_; } + CUfunction operator()() { return kernel_; } +private: + CUmodule module_; + CUfunction kernel_; + std::vector<size_t> arguments_indices_; // Indices of the arguments + std::vector<char> arguments_data_; // The arguments data as raw bytes + + // Internal implementation for the recursive SetArguments function. + template <typename T> + void SetArgumentsRecursive(const size_t index, T &first) { + SetArgument(index, first); + } + template <typename T, typename... Args> + void SetArgumentsRecursive(const size_t index, T &first, Args&... args) { + SetArgument(index, first); + SetArgumentsRecursive(index+1, args...); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_CUPP11_H_ +#endif |