// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API
// calls. The main benefits are increased abstraction, automatic memory management, and portability.
// Portability here means that a similar header exists for CUDA with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
// therefore contains the following header copyright notice:
//
// =================================================================================================
//
// Copyright 2015 SURFsara
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// =================================================================================================
#ifndef CLBLAST_CUPP11_H_
#define CLBLAST_CUPP11_H_

// C++
#include <algorithm> // std::copy
#include <string>    // std::string
#include <vector>    // std::vector
#include <memory>    // std::shared_ptr
#include <cstring>   // std::strlen
#include <cstdio>    // fprintf

// CUDA
#define CUDA_NO_HALF // Incompatible with CLBlast's definition; TODO: resolve this
#include <cuda.h>    // CUDA driver API
#include <nvrtc.h>   // NVIDIA runtime compilation API

// Exception classes
#include "cxpp11_common.hpp"

namespace clblast {
// =================================================================================================

// Max-length of strings
constexpr auto kStringLength = 256;

// =================================================================================================

// Represents a runtime error returned by a CUDA driver API function
class CLCudaAPIError : public ErrorCode<DeviceError, CUresult> {
 public:
  explicit CLCudaAPIError(CUresult status, const std::string &where):
      ErrorCode(status, where, "CUDA error: " + where + ": " +
                GetErrorName(status) + " --> " + GetErrorString(status)) {
  }

  static void Check(const CUresult status, const std::string &where) {
    if (status != CUDA_SUCCESS) {
      throw CLCudaAPIError(status, where);
    }
  }

  static void CheckDtor(const CUresult status, const std::string &where) {
    if (status != CUDA_SUCCESS) {
      fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what());
    }
  }

 private:
  std::string GetErrorName(CUresult status) const {
    const char* status_code;
    cuGetErrorName(status, &status_code);
    return std::string(status_code);
  }
  std::string GetErrorString(CUresult status) const {
    const char* status_string;
    cuGetErrorString(status, &status_string);
    return std::string(status_string);
  }
};

// Represents a runtime error returned by a CUDA runtime compilation API function
class CLCudaAPINVRTCError : public ErrorCode<DeviceError, nvrtcResult> {
 public:
  explicit CLCudaAPINVRTCError(nvrtcResult status, const std::string &where):
      ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) {
  }

  static void Check(const nvrtcResult status, const std::string &where) {
    if (status != NVRTC_SUCCESS) {
      throw CLCudaAPINVRTCError(status, where);
    }
  }

  static void CheckDtor(const nvrtcResult status, const std::string &where) {
    if (status != NVRTC_SUCCESS) {
      fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPINVRTCError(status, where).what());
    }
  }

 private:
  std::string GetErrorString(nvrtcResult status) const {
    const char* status_string = nvrtcGetErrorString(status);
    return std::string(status_string);
  }
};

// Exception returned when building a program
using CLCudaAPIBuildError = CLCudaAPINVRTCError;

// =================================================================================================

// Error occurred in CUDA driver or runtime compilation API
#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call))
#define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call))

// Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors)
#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call))
#define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call))
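
// Illustrative sketch (comment only, not part of this header) of how these macros are used: the
// API call is stringified so that the exception message names the offending call.
//
//   CUdevice device;
//   CheckError(cuDeviceGet(&device, 0)); // throws CLCudaAPIError on failure, with a message such
//                                        // as "CUDA error: cuDeviceGet(&device, 0): ..."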

// =================================================================================================

// C++11 version of two 'CUevent' pointers
class Event {
 public:
  // Note that there is no constructor based on the regular CUDA data-type because of extra state

  // Regular constructor with memory management
  explicit Event():
      start_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }),
      end_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }) {
    CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT));
    CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT));
  }

  // Waits for completion of this event (not implemented for CUDA)
  void WaitForCompletion() const { } // not needed due to cuStreamSynchronize call after each kernel launch

  // Retrieves the elapsed time of the last recorded event
  float GetElapsedTime() const {
    auto result = 0.0f;
    cuEventElapsedTime(&result, *start_, *end_);
    return result;
  }

  // Accessors to the private data-members
  const CUevent& start() const { return *start_; }
  const CUevent& end() const { return *end_; }
  Event* pointer() { return this; }
 private:
  std::shared_ptr<CUevent> start_;
  std::shared_ptr<CUevent> end_;
};

// Pointer to a CUDA event
using EventPointer = Event*;
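
// Illustrative sketch (comment only) of timing device work with this class, given suitable
// 'global' and 'local' thread configurations and a 'queue' and 'kernel' created with the classes
// defined further below:
//
//   auto event = Event();
//   kernel.Launch(queue, global, local, event.pointer());
//   queue.Finish(event);
//   printf("kernel took %.3f ms\n", event.GetElapsedTime());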

// =================================================================================================

// Raw platform ID type
using RawPlatformID = size_t;

// The CUDA platform: initializes the CUDA driver API
class Platform {
 public:

  // Initializes the platform. Note that the platform ID variable is not actually used for CUDA.
  explicit Platform(const size_t platform_id) : platform_id_(0) {
    if (platform_id != 0) { throw LogicError("CUDA back-end requires a platform ID of 0"); }
    CheckError(cuInit(0));
  }

  // Methods to retrieve platform information
  std::string Name() const { return "CUDA"; }
  std::string Vendor() const { return "NVIDIA Corporation"; }
  std::string Version() const {
    auto result = 0;
    CheckError(cuDriverGetVersion(&result));
    return "CUDA driver "+std::to_string(result);
  }

  // Returns the number of devices on this platform
  size_t NumDevices() const {
    auto result = 0;
    CheckError(cuDeviceGetCount(&result));
    return static_cast<size_t>(result);
  }

  // Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0)
  const RawPlatformID& operator()() const { return platform_id_; }
 private:
  const size_t platform_id_;
};

// Retrieves a vector with all platforms. Note that there is just one platform in CUDA.
inline std::vector<Platform> GetAllPlatforms() {
  auto all_platforms = std::vector<Platform>{ Platform(size_t{0}) };
  return all_platforms;
}
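
// Illustrative sketch (comment only): the CUDA back-end exposes a single pseudo-platform, so
// initialization reduces to:
//
//   auto platform = Platform(size_t{0}); // calls cuInit(0)
//   printf("%s: %zu device(s)\n", platform.Name().c_str(), platform.NumDevices());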

// =================================================================================================

// Raw device ID type
using RawDeviceID = CUdevice;

// C++11 version of 'CUdevice'
class Device {
 public:

  // Constructor based on the regular CUDA data-type
  explicit Device(const CUdevice device): device_(device) { }

  // Initialization
  explicit Device(const Platform &platform, const size_t device_id) {
    auto num_devices = platform.NumDevices();
    if (num_devices == 0) { throw RuntimeError("Device: no devices found"); }
    if (device_id >= num_devices) {
      throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
    }
    CheckError(cuDeviceGet(&device_, device_id));
  }

  // Methods to retrieve device information
  RawPlatformID PlatformID() const { return 0; }
  std::string Version() const {
    auto result = 0;
    CheckError(cuDriverGetVersion(&result));
    return "CUDA driver "+std::to_string(result);
  }
  size_t VersionNumber() const {
    auto result = 0;
    CheckError(cuDriverGetVersion(&result));
    return static_cast<size_t>(result);
  }
  std::string Vendor() const { return "NVIDIA Corporation"; }
  std::string Name() const {
    auto result = std::string{};
    result.resize(kStringLength);
    CheckError(cuDeviceGetName(&result[0], result.size(), device_));
    result.resize(std::strlen(result.c_str())); // Removes any trailing '\0'-characters
    return result;
  }
  std::string Type() const { return "GPU"; }
  size_t MaxWorkGroupSize() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); }
  size_t MaxWorkItemDimensions() const { return size_t{3}; }
  std::vector<size_t> MaxWorkItemSizes() const {
    return std::vector<size_t>{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X),
                               GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
                               GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)};
  }
  unsigned long LocalMemSize() const {
    return static_cast<unsigned long>(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK));
  }
  std::string Capabilities() const {
    const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
    const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
    return "SM"+std::to_string(major)+"."+std::to_string(minor);
  }
  std::string ComputeArch() const {
    const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
    const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
    return "compute_"+std::to_string(major)+std::to_string(minor);
  }
  bool HasExtension(const std::string &extension) const { return false; }
  bool SupportsFP64() const { return true; }
  bool SupportsFP16() const {
    const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
    const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
    if (major > 5) { return true; } // SM 6.x, 7.x and higher
    if (major == 5 && minor == 3) { return true; } // SM 5.3
    return false;
  }

  size_t CoreClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE); }
  size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); }
  unsigned long MemorySize() const {
    auto result = size_t{0};
    CheckError(cuDeviceTotalMem(&result, device_));
    return static_cast<unsigned long>(result);
  }
  unsigned long MaxAllocSize() const { return MemorySize(); }
  size_t MemoryClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE); }
  size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); }

  // Configuration-validity checks
  bool IsLocalMemoryValid(const size_t local_mem_usage) const {
    return (local_mem_usage <= LocalMemSize());
  }
  bool IsThreadConfigValid(const std::vector<size_t> &local) const {
    auto local_size = size_t{1};
    for (const auto &item: local) { local_size *= item; }
    for (auto i=size_t{0}; i<local.size(); ++i) {
      if (local[i] > MaxWorkItemSizes()[i]) { return false; }
    }
    if (local_size > MaxWorkGroupSize()) { return false; }
    if (local.size() > MaxWorkItemDimensions()) { return false; }
    return true;
  }

  // Query for a specific type of device or brand
  bool IsCPU() const { return false; }
  bool IsGPU() const { return true; }
  bool IsAMD() const { return false; }
  bool IsNVIDIA() const { return true; }
  bool IsIntel() const { return false; }
  bool IsARM() const { return false; }
  bool IsQualcomm() const { return false; }

  // Platform specific extensions
  std::string AMDBoardName() const { return ""; }
  std::string NVIDIAComputeCapability() const { return Capabilities(); }

  // Returns if the NVIDIA chip is a Volta or later architecture (major version 7 or higher)
  bool IsPostNVIDIAVolta() const {
    return GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 7;
  }

  // Retrieves the above extra information
  std::string GetExtraInfo() const { return NVIDIAComputeCapability(); }

  // Accessor to the private data-member
  const RawDeviceID& operator()() const { return device_; }
 private:
  CUdevice device_;

  // Private helper function
  size_t GetInfo(const CUdevice_attribute info) const {
    auto result = 0;
    CheckError(cuDeviceGetAttribute(&result, info, device_));
    return static_cast<size_t>(result);
  }
};
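
// Illustrative sketch (comment only): querying device properties, assuming the 'platform' object
// from the previous example:
//
//   auto device = Device(platform, size_t{0});
//   printf("%s, %s\n", device.Name().c_str(), device.Capabilities().c_str()); // e.g. "SM7.0"
//   if (!device.SupportsFP16()) { /* fall back to single-precision kernels */ }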

// =================================================================================================

// Raw context type
using RawContext = CUcontext;

// C++11 version of 'CUcontext'
class Context {
 public:

  // Constructor based on the regular CUDA data-type: memory management is handled elsewhere
  explicit Context(const CUcontext context):
      context_(new CUcontext) {
    *context_ = context;
  }

  // Regular constructor with memory management
  explicit Context(const Device &device):
      context_(new CUcontext, [](CUcontext* c) {
        if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); }
        delete c;
      }) {
    CheckError(cuCtxCreate(context_.get(), 0, device()));
  }

  // Accessor to the private data-member
  const RawContext& operator()() const { return *context_; }
  RawContext* pointer() const { return &(*context_); }
 private:
  std::shared_ptr<CUcontext> context_;
};

// Pointer to a raw CUDA context
using ContextPointer = CUcontext*;

// =================================================================================================

// C++11 version of 'nvrtcProgram'. Additionally holds the program's source code.
class Program {
 public:
  Program() = default;

  // Note that there is no constructor based on the regular CUDA data-type because of extra state

  // Source-based constructor with memory management
  explicit Program(const Context &, std::string source):
      program_(new nvrtcProgram, [](nvrtcProgram* p) {
        if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); }
        delete p;
      }),
      source_(std::move(source)),
      from_binary_(false) {
    const auto source_ptr = &source_[0];
    CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_ptr, nullptr, 0, nullptr, nullptr));
  }

  // PTX-based constructor
  explicit Program(const Device &device, const Context &context, const std::string &binary):
      program_(nullptr), // not used
      source_(binary),
      from_binary_(true) {
  }

  // Compiles the device program and checks whether or not there are any warnings/errors
  void Build(const Device &device, std::vector<std::string> &options) {
    options.push_back("-arch=" + device.ComputeArch());
    if (from_binary_) { return; }
    auto raw_options = std::vector<const char*>();
    for (const auto &option: options) {
      raw_options.push_back(option.c_str());
    }
    auto status = nvrtcCompileProgram(*program_, raw_options.size(), raw_options.data());
    CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram");
    CheckError(cuModuleLoadDataEx(&module_, GetIR().data(), 0, nullptr, nullptr));
  }

  // Confirms whether a certain status code is an actual compilation error or warning
  bool StatusIsCompilationWarningOrError(const nvrtcResult status) const {
    return (status == NVRTC_ERROR_COMPILATION);
  }

  // Retrieves the warning/error message from the compiler (if any)
  std::string GetBuildInfo(const Device &) const {
    if (from_binary_) { return std::string{}; }
    auto bytes = size_t{0};
    CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes));
    auto result = std::string{};
    result.resize(bytes);
    CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0]));
    return result;
  }

  // Retrieves an intermediate representation of the compiled program (i.e. PTX)
  std::string GetIR() const {
    if (from_binary_) { return source_; } // holds the PTX
    auto bytes = size_t{0};
    CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes));
    auto result = std::string{};
    result.resize(bytes);
    CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0]));
    return result;
  }

  // Accessors to the private data-members
  const CUmodule GetModule() const { return module_; }
  const nvrtcProgram& operator()() const { return *program_; }
 private:
  std::shared_ptr<nvrtcProgram> program_;
  CUmodule module_;
  std::string source_;
  bool from_binary_;
};
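
// Illustrative sketch (comment only): run-time compilation with NVRTC. The kernel source is made
// up for this example; 'context' and 'device' are the objects from the examples above.
//
//   const auto source = std::string{
//     "extern \"C\" __global__ void twice(float* x) {\n"
//     "  const int i = threadIdx.x + blockIdx.x * blockDim.x;\n"
//     "  x[i] = 2.0f * x[i];\n"
//     "}\n"};
//   auto program = Program(context, source);
//   auto options = std::vector<std::string>{};
//   try { program.Build(device, options); }
//   catch (const CLCudaAPIBuildError &) {
//     printf("%s\n", program.GetBuildInfo(device).c_str()); // prints the compiler log
//   }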

// =================================================================================================

// Raw command-queue type
using RawCommandQueue = CUstream;

// C++11 version of 'CUstream'
class Queue {
 public:
  // Note that there is no constructor based on the regular CUDA data-type because of extra state

  // Regular constructor with memory management
  explicit Queue(const Context &context, const Device &device):
      queue_(new CUstream, [](CUstream* s) {
        if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); }
        delete s;
      }),
      context_(context),
      device_(device) {
    CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING));
  }

  // Synchronizes the queue and optionally also an event
  void Finish(Event &event) const {
    CheckError(cuEventSynchronize(event.end()));
    Finish();
  }
  void Finish() const {
    CheckError(cuStreamSynchronize(*queue_));
  }

  // Retrieves the corresponding context or device
  Context GetContext() const { return context_; }
  Device GetDevice() const { return device_; }

  // Accessor to the private data-member
  const RawCommandQueue& operator()() const { return *queue_; }
 private:
  std::shared_ptr<CUstream> queue_;
  const Context context_;
  const Device device_;
};
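
// Illustrative sketch (comment only): a Queue maps onto a non-blocking CUDA stream. Assuming the
// 'context' and 'device' objects from above:
//
//   auto queue = Queue(context, device);
//   // ... enqueue copies and kernel launches on 'queue' ...
//   queue.Finish(); // blocks until all work on the stream has completed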

// =================================================================================================

// C++11 version of page-locked host memory
template <typename T>
class BufferHost {
 public:

  // Regular constructor with memory management
  explicit BufferHost(const Context &, const size_t size):
      buffer_(new void*, [](void** m) { CheckError(cuMemFreeHost(*m)); delete m; }),
      size_(size) {
    CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T)));
  }

  // Retrieves the actual allocated size in bytes
  size_t GetSize() const {
    return size_*sizeof(T);
  }

  // Compatibility with std::vector
  size_t size() const { return size_; }
  T* begin() { return &static_cast<T*>(*buffer_)[0]; }
  T* end() { return &static_cast<T*>(*buffer_)[size_]; } // one-past-the-end, as for std::vector
  T& operator[](const size_t i) { return static_cast<T*>(*buffer_)[i]; }
  T* data() { return static_cast<T*>(*buffer_); }
  const T* data() const { return static_cast<T*>(*buffer_); }

 private:
  std::shared_ptr<void*> buffer_;
  const size_t size_;
};
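
// Illustrative sketch (comment only): page-locked host memory speeds up transfers to and from the
// device and is a drop-in alternative to std::vector in the Read/Write calls of the Buffer class:
//
//   auto host_buffer = BufferHost<float>(context, size_t{1024});
//   std::fill(host_buffer.begin(), host_buffer.end(), 0.0f);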

// =================================================================================================

// Enumeration of buffer access types
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };

// C++11 version of 'CUdeviceptr'
template <typename T>
class Buffer {
 public:

  // Constructor based on the regular CUDA data-type: memory management is handled elsewhere
  explicit Buffer(const CUdeviceptr buffer):
      buffer_(new CUdeviceptr),
      access_(BufferAccess::kNotOwned) {
    *buffer_ = buffer;
  }

  // Regular constructor with memory management. If this class does not own the buffer object, then
  // the memory will not be freed automatically afterwards.
  explicit Buffer(const Context &, const BufferAccess access, const size_t size):
      buffer_(new CUdeviceptr, [access, size](CUdeviceptr* m) {
        if (access != BufferAccess::kNotOwned && size > 0) { CheckError(cuMemFree(*m)); }
        delete m;
      }),
      access_(access) {
    if (size > 0) { CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T))); }
  }

  // As above, but now with read/write access as a default
  explicit Buffer(const Context &context, const size_t size):
      Buffer<T>(context, BufferAccess::kReadWrite, size) {
  }

  // Constructs a new buffer based on an existing host-container
  template <typename Iterator>
  explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
      Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
    auto size = static_cast<size_t>(end - start);
    auto pointer = &*start;
    CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue()));
    queue.Finish();
  }

  // Copies from device to host: reading the device buffer asynchronously
  void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
    if (access_ == BufferAccess::kWriteOnly) {
      throw LogicError("Buffer: reading from a write-only buffer");
    }
    CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue()));
  }
  void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
                 const size_t offset = 0) const {
    if (host.size() < size) { throw LogicError("Buffer: target host buffer is too small"); }
    ReadAsync(queue, size, host.data(), offset);
  }
  void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
                 const size_t offset = 0) const {
    if (host.size() < size) { throw LogicError("Buffer: target host buffer is too small"); }
    ReadAsync(queue, size, host.data(), offset);
  }

  // Copies from device to host: reading the device buffer
  void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
    ReadAsync(queue, size, host, offset);
    queue.Finish();
  }
  void Read(const Queue &queue, const size_t size, std::vector<T> &host,
            const size_t offset = 0) const {
    Read(queue, size, host.data(), offset);
  }
  void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
            const size_t offset = 0) const {
    Read(queue, size, host.data(), offset);
  }

  // Copies from host to device: writing the device buffer asynchronously
  void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
    if (access_ == BufferAccess::kReadOnly) {
      throw LogicError("Buffer: writing to a read-only buffer");
    }
    if (GetSize() < (offset+size)*sizeof(T)) {
      throw LogicError("Buffer: target device buffer is too small");
    }
    CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue()));
  }
  void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
                  const size_t offset = 0) {
    WriteAsync(queue, size, host.data(), offset);
  }
  void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
                  const size_t offset = 0) {
    WriteAsync(queue, size, host.data(), offset);
  }

  // Copies from host to device: writing the device buffer
  void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
    WriteAsync(queue, size, host, offset);
    queue.Finish();
  }
  void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
             const size_t offset = 0) {
    Write(queue, size, host.data(), offset);
  }
  void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
             const size_t offset = 0) {
    Write(queue, size, host.data(), offset);
  }

  // Copies the contents of this buffer into another device buffer
  void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination,
                   EventPointer event = nullptr) const {
    CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue()));
  }
  void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
    CopyToAsync(queue, size, destination);
    queue.Finish();
  }

  // Retrieves the actual allocated size in bytes
  size_t GetSize() const {
    auto result = size_t{0};
    CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_));
    return result;
  }

  // Accessors to the private data-members
  CUdeviceptr operator()() const { return *buffer_; }
  CUdeviceptr& operator()() { return *buffer_; }
 private:
  std::shared_ptr<CUdeviceptr> buffer_;
  BufferAccess access_;
};

// =================================================================================================

// C++11 version of 'CUfunction'
class Kernel {
 public:

  // Constructor based on the regular CUDA data-type: memory management is handled elsewhere
  explicit Kernel(const CUfunction kernel):
      name_("unknown"),
      kernel_(kernel) {
  }

  // Regular constructor with memory management
  explicit Kernel(const std::shared_ptr<Program> program, const std::string &name):
      name_(name) {
    CheckError(cuModuleGetFunction(&kernel_, program->GetModule(), name.c_str()));
  }

  // Sets a kernel argument at the indicated position. This stores both the value of the argument
  // (as raw bytes) and the index indicating where this value can be found.
  template <typename T>
  void SetArgument(const size_t index, const T &value) {
    if (index >= arguments_indices_.size()) { arguments_indices_.resize(index+1); }
    arguments_indices_[index] = arguments_data_.size();
    for (auto j=size_t(0); j<sizeof(T); ++j) {
      arguments_data_.push_back(reinterpret_cast<const char*>(&value)[j]);
    }
  }
  template <typename T>
  void SetArgument(const size_t index, Buffer<T> &value) {
    SetArgument(index, value());
  }

  // Sets all arguments in one go using parameter packs. Note that this resets all previously set
  // arguments using 'SetArgument' or 'SetArguments'.
  template <typename... Args>
  void SetArguments(Args&... args) {
    arguments_indices_.clear();
    arguments_data_.clear();
    SetArgumentsRecursive(0, args...);
  }

  // Retrieves the amount of local memory used per work-group for this kernel. Note that this is
  // the shared memory in CUDA terminology.
  unsigned long LocalMemUsage(const Device &) const {
    auto result = 0;
    CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_));
    return static_cast<unsigned long>(result);
  }

  // Retrieves the name of the kernel
  std::string GetFunctionName() const {
    return name_;
  }

  // Launches a kernel onto the specified queue
  void Launch(const Queue &queue, const std::vector<size_t> &global,
              const std::vector<size_t> &local, EventPointer event) {
    // TODO: Currently this CUDA launch is always synchronous due to a cuStreamSynchronize call
    if (local.size() == 0) {
      throw LogicError("Kernel: launching with a default workgroup size is not implemented "
                       "for the CUDA back-end");
    }

    // Creates the grid (number of threadblocks) and sets the block sizes (threads per block)
    auto grid = std::vector<size_t>{1, 1, 1};
    auto block = std::vector<size_t>{1, 1, 1};
    if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); }
    for (auto i=size_t{0}; i<local.size(); ++i) { grid[i] = global[i]/local[i]; }
    for (auto i=size_t{0}; i<local.size(); ++i) { block[i] = local[i]; }

    // Creates the array of pointers to the kernel arguments
    std::vector<void*> pointers;
    for (auto &index: arguments_indices_) {
      pointers.push_back(&arguments_data_[index]);
    }

    // Launches the kernel, its execution time is recorded by events
    if (event) { CheckError(cuEventRecord(event->start(), queue())); }
    CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2],
                              0, queue(), pointers.data(), nullptr));
    cuStreamSynchronize(queue());
    if (event) { CheckError(cuEventRecord(event->end(), queue())); }
  }

  // As above, but with an event waiting list
  void Launch(const Queue &queue, const std::vector<size_t> &global,
              const std::vector<size_t> &local, EventPointer event,
              const std::vector<Event> &waitForEvents) {
    for (auto &waitEvent : waitForEvents) {
      waitEvent.WaitForCompletion(); // note: doesn't do anything, every kernel call is synchronous
    }
    return Launch(queue, global, local, event);
  }

  // Accessors to the private data-members
  const CUfunction& operator()() const { return kernel_; }
  CUfunction operator()() { return kernel_; }
 private:
  const std::string name_;
  CUfunction kernel_;
  std::vector<size_t> arguments_indices_; // Indices of the arguments
  std::vector<char> arguments_data_; // The arguments data as raw bytes

  // Internal implementation for the recursive SetArguments function.
  template <typename T>
  void SetArgumentsRecursive(const size_t index, T &first) {
    SetArgument(index, first);
  }
  template <typename T, typename... Args>
  void SetArgumentsRecursive(const size_t index, T &first, Args&... args) {
    SetArgument(index, first);
    SetArgumentsRecursive(index+1, args...);
  }
};
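
// Illustrative sketch (comment only) tying the classes together: upload data, launch, read back.
// The names 'program', 'context' and 'queue' refer to the examples above; 'twice' is the made-up
// NVRTC kernel from the Program example.
//
//   auto input = std::vector<float>(1024, 1.0f);
//   auto buffer = Buffer<float>(context, queue, input.begin(), input.end()); // copies to device
//   auto kernel = Kernel(std::make_shared<Program>(program), "twice");
//   kernel.SetArguments(buffer);
//   kernel.Launch(queue, {1024}, {64}, nullptr); // grid of 1024/64 = 16 blocks of 64 threads
//   buffer.Read(queue, input.size(), input); // copies back to the host: all values are now 2.0f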

// =================================================================================================
} // namespace clblast

// CLBLAST_CUPP11_H_
#endif