diff options
Diffstat (limited to 'src')
27 files changed, 3687 insertions, 314 deletions
diff --git a/src/api_common.cpp b/src/api_common.cpp new file mode 100644 index 00000000..0d387cd9 --- /dev/null +++ b/src/api_common.cpp @@ -0,0 +1,169 @@ +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the common (non-OpenCL-specific) functions of the CLBlast API. +// +// ================================================================================================= + +#include <string> + +#include "utilities/utilities.hpp" +#include "cache.hpp" +#include "routines/routines.hpp" + +namespace clblast { +// ================================================================================================= + +// Clears the cache of stored binaries +StatusCode ClearCache() { + try { + ProgramCache::Instance().Invalidate(); + BinaryCache::Instance().Invalidate(); + } catch (...) { return DispatchException(); } + return StatusCode::kSuccess; +} + +template <typename Real, typename Complex> +void FillCacheForPrecision(Queue &queue) { + try { + + // Runs all the level 1 set-up functions + Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr); + Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr); + Xscal<Real>(queue, nullptr); Xscal<Complex>(queue, nullptr); + Xcopy<Real>(queue, nullptr); Xcopy<Complex>(queue, nullptr); + Xaxpy<Real>(queue, nullptr); Xaxpy<Complex>(queue, nullptr); + Xdot<Real>(queue, nullptr); + Xdotu<Complex>(queue, nullptr); + Xdotc<Complex>(queue, nullptr); + Xnrm2<Real>(queue, nullptr); Xnrm2<Complex>(queue, nullptr); + Xasum<Real>(queue, nullptr); Xasum<Complex>(queue, nullptr); + Xsum<Real>(queue, nullptr); Xsum<Complex>(queue, nullptr); + Xamax<Real>(queue, nullptr); Xamax<Complex>(queue, nullptr); + Xmax<Real>(queue, nullptr); Xmax<Complex>(queue, nullptr); + Xmin<Real>(queue, nullptr); Xmin<Complex>(queue, nullptr); + + // Runs all the level 2 set-up functions + Xgemv<Real>(queue, nullptr); Xgemv<Complex>(queue, nullptr); + Xgbmv<Real>(queue, nullptr); Xgbmv<Complex>(queue, nullptr); + Xhemv<Complex>(queue, nullptr); + Xhbmv<Complex>(queue, nullptr); + Xhpmv<Complex>(queue, nullptr); + Xsymv<Real>(queue, nullptr); + Xsbmv<Real>(queue, nullptr); + Xspmv<Real>(queue, nullptr); + Xtrmv<Real>(queue, nullptr); Xtrmv<Complex>(queue, nullptr); + Xtbmv<Real>(queue, nullptr); Xtbmv<Complex>(queue, nullptr); + Xtpmv<Real>(queue, nullptr); Xtpmv<Complex>(queue, nullptr); + Xger<Real>(queue, nullptr); + Xgeru<Complex>(queue, nullptr); + Xgerc<Complex>(queue, nullptr); + Xher<Complex,Real>(queue, nullptr); + Xhpr<Complex,Real>(queue, nullptr); + Xher2<Complex>(queue, nullptr); + Xhpr2<Complex>(queue, nullptr); + Xsyr<Real>(queue, nullptr); + Xspr<Real>(queue, nullptr); + Xsyr2<Real>(queue, nullptr); + Xspr2<Real>(queue, nullptr); + + // Runs all the level 3 set-up functions + Xgemm<Real>(queue, nullptr); Xgemm<Complex>(queue, nullptr); + Xsymm<Real>(queue, nullptr); Xsymm<Complex>(queue, nullptr); + Xhemm<Complex>(queue, nullptr); + Xsyrk<Real>(queue, nullptr); Xsyrk<Complex>(queue, nullptr); + Xherk<Complex,Real>(queue, nullptr); + Xsyr2k<Real>(queue, nullptr); Xsyr2k<Complex>(queue, nullptr); + Xher2k<Complex,Real>(queue, nullptr); + Xtrmm<Real>(queue, nullptr); Xtrmm<Complex>(queue, nullptr); + + // Runs all the non-BLAS set-up functions + Xomatcopy<Real>(queue, nullptr); Xomatcopy<Complex>(queue, nullptr); + + } catch(const RuntimeErrorCode &e) { + if (e.status() != StatusCode::kNoDoublePrecision && + e.status() != StatusCode::kNoHalfPrecision) { + throw; + } + } +} + +// Fills the cache with all binaries for a specific device +// TODO: Add half-precision FP16 set-up calls +StatusCode FillCache(const RawDeviceID device) { + try { + + // Creates a sample context and queue to match the normal routine calling conventions + auto device_cpp = Device(device); + auto context = Context(device_cpp); + auto queue = Queue(context, device_cpp); + + FillCacheForPrecision<float, float2>(queue); + FillCacheForPrecision<double, double2>(queue); + + } catch (...) { return DispatchException(); } + return StatusCode::kSuccess; +} + +// ================================================================================================= + +// Overrides the tuning parameters for this device-precision-kernel combination +StatusCode OverrideParameters(const RawDeviceID device, const std::string &kernel_name, + const Precision precision, + const std::unordered_map<std::string,size_t> ¶meters) { + try { + + // Retrieves the device name + const auto device_cpp = Device(device); + const auto platform_id = device_cpp.PlatformID(); + const auto device_name = GetDeviceName(device_cpp); + + // Retrieves the current database values to verify whether the new ones are complete + auto in_cache = false; + auto current_database = DatabaseCache::Instance().Get(DatabaseKeyRef{platform_id, device, precision, kernel_name}, &in_cache); + if (!in_cache) { + log_debug("Searching database for kernel '" + kernel_name + "'"); + current_database = Database(device_cpp, kernel_name, precision, {}); + } + + // Verifies the parameters size + const auto current_parameter_names = current_database.GetParameterNames(); + if (current_parameter_names.size() != parameters.size()) { + return StatusCode::kMissingOverrideParameter; + } + + // Retrieves the names and values separately and in the same order as the existing database + auto parameter_values = database::Params{0}; + auto i = size_t{0}; + for (const auto ¤t_param : current_parameter_names) { + if (parameters.find(current_param) == parameters.end()) { + return StatusCode::kMissingOverrideParameter; + } + const auto parameter_value = parameters.at(current_param); + parameter_values[i] = parameter_value; + ++i; + } + + // Creates a small custom database based on the provided parameters + const auto database_device = database::DatabaseDevice{database::kDeviceNameDefault, parameter_values}; + const auto database_architecture = database::DatabaseArchitecture{"default", {database_device}}; + const auto database_vendor = database::DatabaseVendor{database::kDeviceTypeAll, "default", {database_architecture}}; + const auto database_entry = database::DatabaseEntry{kernel_name, precision, current_parameter_names, {database_vendor}}; + const auto database_entries = std::vector<database::DatabaseEntry>{database_entry}; + const auto database = Database(device_cpp, kernel_name, precision, database_entries); + + // Removes the old database entry and stores the new one in the cache + DatabaseCache::Instance().Remove(DatabaseKey{platform_id, device, precision, kernel_name}); + DatabaseCache::Instance().Store(DatabaseKey{platform_id, device, precision, kernel_name}, Database(database)); + + } catch (...) { return DispatchException(); } + return StatusCode::kSuccess; +} + +// ================================================================================================= +} // namespace clblast diff --git a/src/clblast.cpp b/src/clblast.cpp index 9f865a23..7d2c2cef 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -15,67 +15,9 @@ #include <string> -#include "cache.hpp" +#include "routines/routines.hpp" #include "clblast.h" -// BLAS level-1 includes -#include "routines/level1/xswap.hpp" -#include "routines/level1/xscal.hpp" -#include "routines/level1/xcopy.hpp" -#include "routines/level1/xaxpy.hpp" -#include "routines/level1/xdot.hpp" -#include "routines/level1/xdotu.hpp" -#include "routines/level1/xdotc.hpp" -#include "routines/level1/xnrm2.hpp" -#include "routines/level1/xasum.hpp" -#include "routines/level1/xsum.hpp" // non-BLAS routine -#include "routines/level1/xamax.hpp" -#include "routines/level1/xamin.hpp" // non-BLAS routine -#include "routines/level1/xmax.hpp" // non-BLAS routine -#include "routines/level1/xmin.hpp" // non-BLAS routine - -// BLAS level-2 includes -#include "routines/level2/xgemv.hpp" -#include "routines/level2/xgbmv.hpp" -#include "routines/level2/xhemv.hpp" -#include "routines/level2/xhbmv.hpp" -#include "routines/level2/xhpmv.hpp" -#include "routines/level2/xsymv.hpp" -#include "routines/level2/xsbmv.hpp" -#include "routines/level2/xspmv.hpp" -#include "routines/level2/xtrmv.hpp" -#include "routines/level2/xtbmv.hpp" -#include "routines/level2/xtpmv.hpp" -#include "routines/level2/xtrsv.hpp" -#include "routines/level2/xger.hpp" -#include "routines/level2/xgeru.hpp" -#include "routines/level2/xgerc.hpp" -#include "routines/level2/xher.hpp" -#include "routines/level2/xhpr.hpp" -#include "routines/level2/xher2.hpp" -#include "routines/level2/xhpr2.hpp" -#include "routines/level2/xsyr.hpp" -#include "routines/level2/xspr.hpp" -#include "routines/level2/xsyr2.hpp" -#include "routines/level2/xspr2.hpp" - -// BLAS level-3 includes -#include "routines/level3/xgemm.hpp" -#include "routines/level3/xsymm.hpp" -#include "routines/level3/xhemm.hpp" -#include "routines/level3/xsyrk.hpp" -#include "routines/level3/xherk.hpp" -#include "routines/level3/xsyr2k.hpp" -#include "routines/level3/xher2k.hpp" -#include "routines/level3/xtrmm.hpp" -#include "routines/level3/xtrsm.hpp" - -// Level-x includes (non-BLAS) -#include "routines/levelx/xomatcopy.hpp" -#include "routines/levelx/xim2col.hpp" -#include "routines/levelx/xaxpybatched.hpp" -#include "routines/levelx/xgemmbatched.hpp" - namespace clblast { // ================================================================================================= @@ -2389,153 +2331,6 @@ template StatusCode PUBLIC_API GemmBatched<half>(const Layout, const Transpose, cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// Clears the cache of stored binaries -StatusCode ClearCache() { - try { - ProgramCache::Instance().Invalidate(); - BinaryCache::Instance().Invalidate(); - } catch (...) { return DispatchException(); } - return StatusCode::kSuccess; -} - -template <typename Real, typename Complex> -void FillCacheForPrecision(Queue &queue) { - try { - - // Runs all the level 1 set-up functions - Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr); - Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr); - Xscal<Real>(queue, nullptr); Xscal<Complex>(queue, nullptr); - Xcopy<Real>(queue, nullptr); Xcopy<Complex>(queue, nullptr); - Xaxpy<Real>(queue, nullptr); Xaxpy<Complex>(queue, nullptr); - Xdot<Real>(queue, nullptr); - Xdotu<Complex>(queue, nullptr); - Xdotc<Complex>(queue, nullptr); - Xnrm2<Real>(queue, nullptr); Xnrm2<Complex>(queue, nullptr); - Xasum<Real>(queue, nullptr); Xasum<Complex>(queue, nullptr); - Xsum<Real>(queue, nullptr); Xsum<Complex>(queue, nullptr); - Xamax<Real>(queue, nullptr); Xamax<Complex>(queue, nullptr); - Xmax<Real>(queue, nullptr); Xmax<Complex>(queue, nullptr); - Xmin<Real>(queue, nullptr); Xmin<Complex>(queue, nullptr); - - // Runs all the level 2 set-up functions - Xgemv<Real>(queue, nullptr); Xgemv<Complex>(queue, nullptr); - Xgbmv<Real>(queue, nullptr); Xgbmv<Complex>(queue, nullptr); - Xhemv<Complex>(queue, nullptr); - Xhbmv<Complex>(queue, nullptr); - Xhpmv<Complex>(queue, nullptr); - Xsymv<Real>(queue, nullptr); - Xsbmv<Real>(queue, nullptr); - Xspmv<Real>(queue, nullptr); - Xtrmv<Real>(queue, nullptr); Xtrmv<Complex>(queue, nullptr); - Xtbmv<Real>(queue, nullptr); Xtbmv<Complex>(queue, nullptr); - Xtpmv<Real>(queue, nullptr); Xtpmv<Complex>(queue, nullptr); - Xger<Real>(queue, nullptr); - Xgeru<Complex>(queue, nullptr); - Xgerc<Complex>(queue, nullptr); - Xher<Complex,Real>(queue, nullptr); - Xhpr<Complex,Real>(queue, nullptr); - Xher2<Complex>(queue, nullptr); - Xhpr2<Complex>(queue, nullptr); - Xsyr<Real>(queue, nullptr); - Xspr<Real>(queue, nullptr); - Xsyr2<Real>(queue, nullptr); - Xspr2<Real>(queue, nullptr); - - // Runs all the level 3 set-up functions - Xgemm<Real>(queue, nullptr); Xgemm<Complex>(queue, nullptr); - Xsymm<Real>(queue, nullptr); Xsymm<Complex>(queue, nullptr); - Xhemm<Complex>(queue, nullptr); - Xsyrk<Real>(queue, nullptr); Xsyrk<Complex>(queue, nullptr); - Xherk<Complex,Real>(queue, nullptr); - Xsyr2k<Real>(queue, nullptr); Xsyr2k<Complex>(queue, nullptr); - Xher2k<Complex,Real>(queue, nullptr); - Xtrmm<Real>(queue, nullptr); Xtrmm<Complex>(queue, nullptr); - - // Runs all the non-BLAS set-up functions - Xomatcopy<Real>(queue, nullptr); Xomatcopy<Complex>(queue, nullptr); - - } catch(const RuntimeErrorCode &e) { - if (e.status() != StatusCode::kNoDoublePrecision && - e.status() != StatusCode::kNoHalfPrecision) { - throw; - } - } -} - -// Fills the cache with all binaries for a specific device -// TODO: Add half-precision FP16 set-up calls -StatusCode FillCache(const cl_device_id device) { - try { - - // Creates a sample context and queue to match the normal routine calling conventions - auto device_cpp = Device(device); - auto context = Context(device_cpp); - auto queue = Queue(context, device_cpp); - - FillCacheForPrecision<float, float2>(queue); - FillCacheForPrecision<double, double2>(queue); - - } catch (...) { return DispatchException(); } - return StatusCode::kSuccess; -} - -// ================================================================================================= - -// Overrides the tuning parameters for this device-precision-kernel combination -StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name, - const Precision precision, - const std::unordered_map<std::string,size_t> ¶meters) { - try { - - // Retrieves the device name - const auto device_cpp = Device(device); - const auto platform_id = device_cpp.PlatformID(); - const auto device_name = GetDeviceName(device_cpp); - - // Retrieves the current database values to verify whether the new ones are complete - auto in_cache = false; - auto current_database = DatabaseCache::Instance().Get(DatabaseKeyRef{platform_id, device, precision, kernel_name}, &in_cache); - if (!in_cache) { - log_debug("Searching database for kernel '" + kernel_name + "'"); - current_database = Database(device_cpp, kernel_name, precision, {}); - } - - // Verifies the parameters size - const auto current_parameter_names = current_database.GetParameterNames(); - if (current_parameter_names.size() != parameters.size()) { - return StatusCode::kMissingOverrideParameter; - } - - // Retrieves the names and values separately and in the same order as the existing database - auto parameter_values = database::Params{0}; - auto i = size_t{0}; - for (const auto ¤t_param : current_parameter_names) { - if (parameters.find(current_param) == parameters.end()) { - return StatusCode::kMissingOverrideParameter; - } - const auto parameter_value = parameters.at(current_param); - parameter_values[i] = parameter_value; - ++i; - } - - // Creates a small custom database based on the provided parameters - const auto database_device = database::DatabaseDevice{database::kDeviceNameDefault, parameter_values}; - const auto database_architecture = database::DatabaseArchitecture{"default", {database_device}}; - const auto database_vendor = database::DatabaseVendor{database::kDeviceTypeAll, "default", {database_architecture}}; - const auto database_entry = database::DatabaseEntry{kernel_name, precision, current_parameter_names, {database_vendor}}; - const auto database_entries = std::vector<database::DatabaseEntry>{database_entry}; - const auto database = Database(device_cpp, kernel_name, precision, database_entries); - - // Removes the old database entry and stores the new one in the cache - DatabaseCache::Instance().Remove(DatabaseKey{platform_id, device, precision, kernel_name}); - DatabaseCache::Instance().Store(DatabaseKey{platform_id, device, precision, kernel_name}, Database(database)); - - } catch (...) { return DispatchException(); } - return StatusCode::kSuccess; -} // ================================================================================================= } // namespace clblast diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp new file mode 100644 index 00000000..0e3d949d --- /dev/null +++ b/src/clblast_cuda.cpp @@ -0,0 +1,2436 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements all the BLAS API calls (CUDA version). In all cases, it does not much more +// than creating a new object of the appropriate type, and calling the main routine on that object. +// It forwards all status codes to the caller. +// +// ================================================================================================= + +#include <string> + +#include "routines/routines.hpp" +#include "clblast_cuda.h" + +namespace clblast { + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +template <typename T> +StatusCode Rotg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotg<float>(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Rotg<double>(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); + +// Generate modified givens plane rotation: SROTMG/DROTMG +template <typename T> +StatusCode Rotmg(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotmg<float>(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Rotmg<double>(CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); + +// Apply givens plane rotation: SROT/DROT +template <typename T> +StatusCode Rot(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const T, + const T, + const CUcontext, const CUdevice) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rot<float>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const float, + const float, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Rot<double>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const double, + const double, + const CUcontext, const CUdevice); + +// Apply modified givens plane rotation: SROTM/DROTM +template <typename T> +StatusCode Rotm(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotm<float>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Rotm<double>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +template <typename T> +StatusCode Swap(const size_t n, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xswap<T>(queue_cpp, nullptr); + routine.DoSwap(n, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Swap<float>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Swap<double>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Swap<float2>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Swap<double2>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Swap<half>(const size_t, + CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +template <typename T> +StatusCode Scal(const size_t n, + const T alpha, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xscal<T>(queue_cpp, nullptr); + routine.DoScal(n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Scal<float>(const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Scal<double>(const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Scal<float2>(const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Scal<double2>(const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Scal<half>(const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +template <typename T> +StatusCode Copy(const size_t n, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xcopy<T>(queue_cpp, nullptr); + routine.DoCopy(n, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Copy<float>(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Copy<double>(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Copy<float2>(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Copy<double2>(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Copy<half>(const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +template <typename T> +StatusCode Axpy(const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xaxpy<T>(queue_cpp, nullptr); + routine.DoAxpy(n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Axpy<float>(const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Axpy<double>(const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Axpy<float2>(const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Axpy<double2>(const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Axpy<half>(const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Dot product of two vectors: SDOT/DDOT/HDOT +template <typename T> +StatusCode Dot(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xdot<T>(queue_cpp, nullptr); + routine.DoDot(n, + Buffer<T>(dot_buffer), dot_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Dot<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Dot<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Dot<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Dot product of two complex vectors: CDOTU/ZDOTU +template <typename T> +StatusCode Dotu(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xdotu<T>(queue_cpp, nullptr); + routine.DoDotu(n, + Buffer<T>(dot_buffer), dot_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Dotu<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Dotu<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +template <typename T> +StatusCode Dotc(const size_t n, + CUdeviceptr dot_buffer, const size_t dot_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xdotc<T>(queue_cpp, nullptr); + routine.DoDotc(n, + Buffer<T>(dot_buffer), dot_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Dotc<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Dotc<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +template <typename T> +StatusCode Nrm2(const size_t n, + CUdeviceptr nrm2_buffer, const size_t nrm2_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xnrm2<T>(queue_cpp, nullptr); + routine.DoNrm2(n, + Buffer<T>(nrm2_buffer), nrm2_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Nrm2<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Nrm2<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Nrm2<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Nrm2<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Nrm2<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +template <typename T> +StatusCode Asum(const size_t n, + CUdeviceptr asum_buffer, const size_t asum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xasum<T>(queue_cpp, nullptr); + routine.DoAsum(n, + Buffer<T>(asum_buffer), asum_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Asum<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Asum<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Asum<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Asum<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Asum<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +template <typename T> +StatusCode Sum(const size_t n, + CUdeviceptr sum_buffer, const size_t sum_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xsum<T>(queue_cpp, nullptr); + routine.DoSum(n, + Buffer<T>(sum_buffer), sum_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Sum<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Sum<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Sum<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Sum<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Sum<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +template <typename T> +StatusCode Amax(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xamax<T>(queue_cpp, nullptr); + routine.DoAmax(n, + Buffer<unsigned int>(imax_buffer), imax_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Amax<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Amax<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Amax<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Amax<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Amax<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +template <typename T> +StatusCode Amin(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xamin<T>(queue_cpp, nullptr); + routine.DoAmin(n, + Buffer<unsigned int>(imin_buffer), imin_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Amin<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Amin<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Amin<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Amin<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Amin<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +template <typename T> +StatusCode Max(const size_t n, + CUdeviceptr imax_buffer, const size_t imax_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xmax<T>(queue_cpp, nullptr); + routine.DoMax(n, + Buffer<unsigned int>(imax_buffer), imax_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Max<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Max<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Max<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Max<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Max<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +template <typename T> +StatusCode Min(const size_t n, + CUdeviceptr imin_buffer, const size_t imin_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xmin<T>(queue_cpp, nullptr); + routine.DoMin(n, + Buffer<unsigned int>(imin_buffer), imin_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Min<float>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Min<double>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Min<float2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Min<double2>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Min<half>(const size_t, + CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +template <typename T> +StatusCode Gemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xgemv<T>(queue_cpp, nullptr); + routine.DoGemv(layout, a_transpose, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Gemv<float>(const Layout, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Gemv<double>(const Layout, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Gemv<float2>(const Layout, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Gemv<double2>(const Layout, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Gemv<half>(const Layout, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +template <typename T> +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xgbmv<T>(queue_cpp, nullptr); + routine.DoGbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Gbmv<float>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Gbmv<double>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Gbmv<float2>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Gbmv<double2>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Gbmv<half>(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV +template <typename T> +StatusCode Hemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xhemv<T>(queue_cpp, nullptr); + routine.DoHemv(layout, triangle, + n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hemv<float2>(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Hemv<double2>(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template <typename T> +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xhbmv<T>(queue_cpp, nullptr); + routine.DoHbmv(layout, triangle, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hbmv<float2>(const Layout, const Triangle, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Hbmv<double2>(const Layout, const Triangle, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template <typename T> +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xhpmv<T>(queue_cpp, nullptr); + routine.DoHpmv(layout, triangle, + n, + alpha, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hpmv<float2>(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Hpmv<double2>(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +template <typename T> +StatusCode Symv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xsymv<T>(queue_cpp, nullptr); + routine.DoSymv(layout, triangle, + n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Symv<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Symv<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Symv<half>(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +template <typename T> +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xsbmv<T>(queue_cpp, nullptr); + routine.DoSbmv(layout, triangle, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Sbmv<float>(const Layout, const Triangle, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Sbmv<double>(const Layout, const Triangle, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Sbmv<half>(const Layout, const Triangle, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +template <typename T> +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr ap_buffer, const size_t ap_offset, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xspmv<T>(queue_cpp, nullptr); + routine.DoSpmv(layout, triangle, + n, + alpha, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Spmv<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Spmv<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Spmv<half>(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +template <typename T> +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xtrmv<T>(queue_cpp, nullptr); + routine.DoTrmv(layout, triangle, a_transpose, diagonal, + n, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trmv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trmv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trmv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trmv<half>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +template <typename T> +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xtbmv<T>(queue_cpp, nullptr); + routine.DoTbmv(layout, triangle, a_transpose, diagonal, + n, k, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tbmv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tbmv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tbmv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tbmv<half>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +template <typename T> +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr ap_buffer, const size_t ap_offset, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xtpmv<T>(queue_cpp, nullptr); + routine.DoTpmv(layout, triangle, a_transpose, diagonal, + n, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tpmv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tpmv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tpmv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tpmv<half>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template <typename T> +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xtrsv<T>(queue_cpp, nullptr); + routine.DoTrsv(layout, triangle, a_transpose, diagonal, + n, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Trsv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trsv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trsv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trsv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template <typename T> +StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Tbsv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tbsv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tbsv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tbsv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template <typename T> +StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Tpsv<float>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tpsv<double>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tpsv<float2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Tpsv<double2>(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// General rank-1 matrix update: SGER/DGER/HGER +template <typename T> +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xger<T>(queue_cpp, nullptr); + routine.DoGer(layout, + m, n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Ger<float>(const Layout, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Ger<double>(const Layout, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Ger<half>(const Layout, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// General rank-1 complex matrix update: CGERU/ZGERU +template <typename T> +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xgeru<T>(queue_cpp, nullptr); + routine.DoGeru(layout, + m, n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Geru<float2>(const Layout, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Geru<double2>(const Layout, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template <typename T> +StatusCode Gerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xgerc<T>(queue_cpp, nullptr); + routine.DoGerc(layout, + m, n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Gerc<float2>(const Layout, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Gerc<double2>(const Layout, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Hermitian rank-1 matrix update: CHER/ZHER +template <typename T> +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xher<std::complex<T>,T>(queue_cpp, nullptr); + routine.DoHer(layout, triangle, + n, + alpha, + Buffer<std::complex<T>>(x_buffer), x_offset, x_inc, + Buffer<std::complex<T>>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Her<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Her<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template <typename T> +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xhpr<std::complex<T>,T>(queue_cpp, nullptr); + routine.DoHpr(layout, triangle, + n, + alpha, + Buffer<std::complex<T>>(x_buffer), x_offset, x_inc, + Buffer<std::complex<T>>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hpr<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Hpr<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template <typename T> +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xher2<T>(queue_cpp, nullptr); + routine.DoHer2(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Her2<float2>(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Her2<double2>(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template <typename T> +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xhpr2<T>(queue_cpp, nullptr); + routine.DoHpr2(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hpr2<float2>(const Layout, const Triangle, + const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Hpr2<double2>(const Layout, const Triangle, + const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +template <typename T> +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xsyr<T>(queue_cpp, nullptr); + routine.DoSyr(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Syr<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Syr<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Syr<half>(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +template <typename T> +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xspr<T>(queue_cpp, nullptr); + routine.DoSpr(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Spr<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Spr<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Spr<half>(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +template <typename T> +StatusCode Syr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xsyr2<T>(queue_cpp, nullptr); + routine.DoSyr2(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(a_buffer), a_offset, a_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Syr2<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Syr2<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Syr2<half>(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +template <typename T> +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, + const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, + CUdeviceptr ap_buffer, const size_t ap_offset, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xspr2<T>(queue_cpp, nullptr); + routine.DoSpr2(layout, triangle, + n, + alpha, + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc, + Buffer<T>(ap_buffer), ap_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Spr2<float>(const Layout, const Triangle, + const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Spr2<double>(const Layout, const Triangle, + const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Spr2<half>(const Layout, const Triangle, + const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +template <typename T> +StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xgemm<T>(queue_cpp, nullptr); + routine.DoGemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Gemm<float>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Gemm<double>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Gemm<float2>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Gemm<double2>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Gemm<half>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +template <typename T> +StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xsymm<T>(queue_cpp, nullptr); + routine.DoSymm(layout, side, triangle, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Symm<float>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Symm<double>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Symm<float2>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Symm<double2>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Symm<half>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +template <typename T> +StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xhemm<T>(queue_cpp, nullptr); + routine.DoHemm(layout, side, triangle, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Hemm<float2>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Hemm<double2>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +template <typename T> +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xsyrk<T>(queue_cpp, nullptr); + routine.DoSyrk(layout, triangle, a_transpose, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Syrk<float>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Syrk<double>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Syrk<float2>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Syrk<double2>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Syrk<half>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +template <typename T> +StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xherk<std::complex<T>,T>(queue_cpp, nullptr); + routine.DoHerk(layout, triangle, a_transpose, + n, k, + alpha, + Buffer<std::complex<T>>(a_buffer), a_offset, a_ld, + beta, + Buffer<std::complex<T>>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Herk<float>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Herk<double>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +template <typename T> +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xsyr2k<T>(queue_cpp, nullptr); + routine.DoSyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Syr2k<float>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Syr2k<double>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Syr2k<float2>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Syr2k<double2>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double2, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Syr2k<half>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const half, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +template <typename T, typename U> +StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xher2k<T,U>(queue_cpp, nullptr); + routine.DoHer2k(layout, triangle, ab_transpose, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, + beta, + Buffer<T>(c_buffer), c_offset, c_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Her2k<float2,float>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const float, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Her2k<double2,double>(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + const CUdeviceptr, const size_t, const size_t, + const double, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +template <typename T> +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xtrmm<T>(queue_cpp, nullptr); + routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Trmm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trmm<double>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trmm<float2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trmm<double2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +template <typename T> +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xtrsm<T>(queue_cpp, nullptr); + routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Trsm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trsm<double>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trsm<float2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +template <typename T> +StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, + CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xomatcopy<T>(queue_cpp, nullptr); + routine.DoOmatcopy(layout, a_transpose, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Omatcopy<float>(const Layout, const Transpose, + const size_t, const size_t, + const float, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Omatcopy<double>(const Layout, const Transpose, + const size_t, const size_t, + const double, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Omatcopy<float2>(const Layout, const Transpose, + const size_t, const size_t, + const float2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Omatcopy<double2>(const Layout, const Transpose, + const size_t, const size_t, + const double2, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose, + const size_t, const size_t, + const half, + const CUdeviceptr, const size_t, const size_t, + CUdeviceptr, const size_t, const size_t, + const CUcontext, const CUdevice); + +// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL +template <typename T> +StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, + const CUdeviceptr im_buffer, const size_t im_offset, + CUdeviceptr col_buffer, const size_t col_offset, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = Xim2col<T>(queue_cpp, nullptr); + routine.DoIm2col(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + Buffer<T>(im_buffer), im_offset, + Buffer<T>(col_buffer), col_offset); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API Im2col<float>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Im2col<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Im2col<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Im2col<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API Im2col<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const CUdeviceptr, const size_t, + CUdeviceptr, const size_t, + const CUcontext, const CUdevice); + +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +template <typename T> +StatusCode AxpyBatched(const size_t n, + const T *alphas, + const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc, + CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = XaxpyBatched<T>(queue_cpp, nullptr); + auto alphas_cpp = std::vector<T>(); + auto x_offsets_cpp = std::vector<size_t>(); + auto y_offsets_cpp = std::vector<size_t>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + x_offsets_cpp.push_back(x_offsets[batch]); + y_offsets_cpp.push_back(y_offsets[batch]); + } + routine.DoAxpyBatched(n, + alphas_cpp, + Buffer<T>(x_buffer), x_offsets_cpp, x_inc, + Buffer<T>(y_buffer), y_offsets_cpp, y_inc, + batch_count); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API AxpyBatched<float>(const size_t, + const float*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API AxpyBatched<double>(const size_t, + const double*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t, + const float2*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t, + const double2*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API AxpyBatched<half>(const size_t, + const half*, + const CUdeviceptr, const size_t*, const size_t, + CUdeviceptr, const size_t*, const size_t, + const size_t, + const CUcontext, const CUdevice); + +// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED +template <typename T> +StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T *alphas, + const CUdeviceptr a_buffer, const size_t *a_offsets, const size_t a_ld, + const CUdeviceptr b_buffer, const size_t *b_offsets, const size_t b_ld, + const T *betas, + CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + const CUcontext context, const CUdevice device) { + try { + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); + auto routine = XgemmBatched<T>(queue_cpp, nullptr); + auto alphas_cpp = std::vector<T>(); + auto betas_cpp = std::vector<T>(); + auto a_offsets_cpp = std::vector<size_t>(); + auto b_offsets_cpp = std::vector<size_t>(); + auto c_offsets_cpp = std::vector<size_t>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + betas_cpp.push_back(betas[batch]); + a_offsets_cpp.push_back(a_offsets[batch]); + b_offsets_cpp.push_back(b_offsets[batch]); + c_offsets_cpp.push_back(c_offsets[batch]); + } + routine.DoGemmBatched(layout, a_transpose, b_transpose, + m, n, k, + alphas_cpp, + Buffer<T>(a_buffer), a_offsets_cpp, a_ld, + Buffer<T>(b_buffer), b_offsets_cpp, b_ld, + betas_cpp, + Buffer<T>(c_buffer), c_offsets_cpp, c_ld, + batch_count); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API GemmBatched<float>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const float*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API GemmBatched<double>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const double*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API GemmBatched<float2>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float2*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const float2*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API GemmBatched<double2>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double2*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const double2*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + const CUcontext, const CUdevice); +template StatusCode PUBLIC_API GemmBatched<half>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const half*, + const CUdeviceptr, const size_t*, const size_t, + const CUdeviceptr, const size_t*, const size_t, + const half*, + CUdeviceptr, const size_t*, const size_t, + const size_t, + const CUcontext, const CUdevice); + +// ================================================================================================= +} // namespace clblast diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 97045644..2335caef 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -668,6 +668,9 @@ class Buffer { // Copies from host to device: writing the device buffer a-synchronously void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { + if (access_ == BufferAccess::kReadOnly) { + throw LogicError("Buffer: writing to a read-only buffer"); + } if (GetSize() < (offset+size)*sizeof(T)) { throw LogicError("Buffer: target device buffer is too small"); } diff --git a/src/cupp11.hpp b/src/cupp11.hpp new file mode 100644 index 00000000..ec21c5b1 --- /dev/null +++ b/src/cupp11.hpp @@ -0,0 +1,782 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a bunch of C++11 classes that act as wrappers around OpenCL objects and API +// calls. The main benefits are increased abstraction, automatic memory management, and portability. +// Portability here means that a similar header exists for CUDA with the same classes and +// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. +// +// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and +// therefore contains the following header copyright notice: +// +// ================================================================================================= +// +// Copyright 2015 SURFsara +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// ================================================================================================= + +#ifndef CLBLAST_CUPP11_H_ +#define CLBLAST_CUPP11_H_ + +// C++ +#include <algorithm> // std::copy +#include <string> // std::string +#include <vector> // std::vector +#include <memory> // std::shared_ptr +#include <cstring> // std::strlen + +// CUDA +#define CUDA_NO_HALF // Incompatible with CLBlast's definition; TODO: resolve this +#include <cuda.h> // CUDA driver API +#include <nvrtc.h> // NVIDIA runtime compilation API + +// Exception classes +#include "cxpp11_common.hpp" + +namespace clblast { +// ================================================================================================= + +// Max-length of strings +constexpr auto kStringLength = 256; + +// ================================================================================================= + +// Represents a runtime error returned by a CUDA driver API function +class CLCudaAPIError : public ErrorCode<DeviceError, CUresult> { +public: + explicit CLCudaAPIError(CUresult status, const std::string &where): + ErrorCode(status, where, "CUDA error: " + where + ": " + + GetErrorName(status) + " --> " + GetErrorString(status)) { + } + + static void Check(const CUresult status, const std::string &where) { + if (status != CUDA_SUCCESS) { + throw CLCudaAPIError(status, where); + } + } + + static void CheckDtor(const CUresult status, const std::string &where) { + if (status != CUDA_SUCCESS) { + fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what()); + } + } + +private: + std::string GetErrorName(CUresult status) const { + const char* status_code; + cuGetErrorName(status, &status_code); + return std::string(status_code); + } + std::string GetErrorString(CUresult status) const { + const char* status_string; + cuGetErrorString(status, &status_string); + return std::string(status_string); + } +}; + +// Represents a runtime error returned by a CUDA runtime compilation API function +class CLCudaAPINVRTCError : public ErrorCode<DeviceError, nvrtcResult> { +public: + explicit CLCudaAPINVRTCError(nvrtcResult status, const std::string &where): + ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) { + } + + static void Check(const nvrtcResult status, const std::string &where) { + if (status != NVRTC_SUCCESS) { + throw CLCudaAPINVRTCError(status, where); + } + } + + static void CheckDtor(const nvrtcResult status, const std::string &where) { + if (status != NVRTC_SUCCESS) { + fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPINVRTCError(status, where).what()); + } + } + +private: + std::string GetErrorString(nvrtcResult status) const { + const char* status_string = nvrtcGetErrorString(status); + return std::string(status_string); + } +}; + +// Exception returned when building a program +using CLCudaAPIBuildError = CLCudaAPINVRTCError; + +// ================================================================================================= + +// Error occurred in CUDA driver or runtime compilation API +#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call)) +#define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call)) + +// Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors) +#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call)) +#define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call)) + +// ================================================================================================= + +// C++11 version of two 'CUevent' pointers +class Event { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Regular constructor with memory management + explicit Event(): + start_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }), + end_(new CUevent, [](CUevent* e) { CheckErrorDtor(cuEventDestroy(*e)); delete e; }) { + CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT)); + CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT)); + } + + // Waits for completion of this event (not implemented for CUDA) + void WaitForCompletion() const { } // not needed due to cuStreamSynchronize call after each kernel launch + + // Retrieves the elapsed time of the last recorded event + float GetElapsedTime() const { + auto result = 0.0f; + cuEventElapsedTime(&result, *start_, *end_); + return result; + } + + // Accessors to the private data-members + const CUevent& start() const { return *start_; } + const CUevent& end() const { return *end_; } + Event* pointer() { return this; } +private: + std::shared_ptr<CUevent> start_; + std::shared_ptr<CUevent> end_; +}; + +// Pointer to a CUDA event +using EventPointer = Event*; + +// ================================================================================================= + +// Raw platform ID type +using RawPlatformID = size_t; + +// The CUDA platform: initializes the CUDA driver API +class Platform { +public: + + // Initializes the platform. Note that the platform ID variable is not actually used for CUDA. + explicit Platform(const size_t platform_id) : platform_id_(0) { + if (platform_id != 0) { throw LogicError("CUDA back-end requires a platform ID of 0"); } + CheckError(cuInit(0)); + } + + // Methods to retrieve platform information + std::string Name() const { return "CUDA"; } + std::string Vendor() const { return "NVIDIA Corporation"; } + std::string Version() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return "CUDA driver "+std::to_string(result); + } + + // Returns the number of devices on this platform + size_t NumDevices() const { + auto result = 0; + CheckError(cuDeviceGetCount(&result)); + return static_cast<size_t>(result); + } + + // Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0) + const RawPlatformID& operator()() const { return platform_id_; } +private: + const size_t platform_id_; +}; + +// Retrieves a vector with all platforms. Note that there is just one platform in CUDA. +inline std::vector<Platform> GetAllPlatforms() { + auto all_platforms = std::vector<Platform>{ Platform(size_t{0}) }; + return all_platforms; +} + +// ================================================================================================= + +// Raw device ID type +using RawDeviceID = CUdevice; + +// C++11 version of 'CUdevice' +class Device { +public: + + // Constructor based on the regular CUDA data-type + explicit Device(const CUdevice device): device_(device) { } + + // Initialization + explicit Device(const Platform &platform, const size_t device_id) { + auto num_devices = platform.NumDevices(); + if (num_devices == 0) { + throw RuntimeError("Device: no devices found"); + } + if (device_id >= num_devices) { + throw RuntimeError("Device: invalid device ID "+std::to_string(device_id)); + } + + CheckError(cuDeviceGet(&device_, device_id)); + } + + // Methods to retrieve device information + RawPlatformID PlatformID() const { return 0; } + std::string Version() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return "CUDA driver "+std::to_string(result); + } + size_t VersionNumber() const { + auto result = 0; + CheckError(cuDriverGetVersion(&result)); + return static_cast<size_t>(result); + } + std::string Vendor() const { return "NVIDIA Corporation"; } + std::string Name() const { + auto result = std::string{}; + result.resize(kStringLength); + CheckError(cuDeviceGetName(&result[0], result.size(), device_)); + result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters + return result; + } + std::string Type() const { return "GPU"; } + size_t MaxWorkGroupSize() const {return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); } + size_t MaxWorkItemDimensions() const { return size_t{3}; } + std::vector<size_t> MaxWorkItemSizes() const { + return std::vector<size_t>{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X), + GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y), + GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)}; + } + unsigned long LocalMemSize() const { + return static_cast<unsigned long>(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)); + } + + std::string Capabilities() const { + const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + return "SM"+std::to_string(major)+"."+std::to_string(minor); + } + std::string ComputeArch() const { + const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + return "compute_"+std::to_string(major)+std::to_string(minor); + } + bool HasExtension(const std::string &extension) const { return false; } + bool SupportsFP64() const { return true; } + bool SupportsFP16() const { + const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + if (major > 5) { return true; } // SM 6.x, 7.x and higher + if (major == 5 && minor == 3) { return true; } // SM 5.3 + return false; + } + + size_t CoreClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE); } + size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); } + unsigned long MemorySize() const { + auto result = size_t{0}; + CheckError(cuDeviceTotalMem(&result, device_)); + return static_cast<unsigned long>(result); + } + unsigned long MaxAllocSize() const { return MemorySize(); } + size_t MemoryClock() const { return 1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE); } + size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); } + + // Configuration-validity checks + bool IsLocalMemoryValid(const size_t local_mem_usage) const { + return (local_mem_usage <= LocalMemSize()); + } + bool IsThreadConfigValid(const std::vector<size_t> &local) const { + auto local_size = size_t{1}; + for (const auto &item: local) { local_size *= item; } + for (auto i=size_t{0}; i<local.size(); ++i) { + if (local[i] > MaxWorkItemSizes()[i]) { return false; } + } + if (local_size > MaxWorkGroupSize()) { return false; } + if (local.size() > MaxWorkItemDimensions()) { return false; } + return true; + } + + // Query for a specific type of device or brand + bool IsCPU() const { return false; } + bool IsGPU() const { return true; } + bool IsAMD() const { return false; } + bool IsNVIDIA() const { return true; } + bool IsIntel() const { return false; } + bool IsARM() const { return false; } + + // Platform specific extensions + std::string AMDBoardName() const { return ""; } + std::string NVIDIAComputeCapability() const { return Capabilities(); } + + // Accessor to the private data-member + const RawDeviceID& operator()() const { return device_; } +private: + CUdevice device_; + + // Private helper function + size_t GetInfo(const CUdevice_attribute info) const { + auto result = 0; + CheckError(cuDeviceGetAttribute(&result, info, device_)); + return static_cast<size_t>(result); + } +}; + +// ================================================================================================= + +// Raw context type +using RawContext = CUcontext; + +// C++11 version of 'CUcontext' +class Context { +public: + + // Constructor based on the regular CUDA data-type: memory management is handled elsewhere + explicit Context(const CUcontext context): + context_(new CUcontext) { + *context_ = context; + } + + // Regular constructor with memory management + explicit Context(const Device &device): + context_(new CUcontext, [](CUcontext* c) { + if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); } + delete c; + }) { + CheckError(cuCtxCreate(context_.get(), 0, device())); + } + + // Accessor to the private data-member + const RawContext& operator()() const { return *context_; } + RawContext* pointer() const { return &(*context_); } +private: + std::shared_ptr<CUcontext> context_; +}; + +// Pointer to a raw CUDA context +using ContextPointer = CUcontext*; + +// ================================================================================================= + +// C++11 version of 'nvrtcProgram'. Additionally holds the program's source code. +class Program { +public: + Program() = default; + + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Source-based constructor with memory management + explicit Program(const Context &, std::string source): + program_(new nvrtcProgram, [](nvrtcProgram* p) { + if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); } + delete p; + }), + source_(std::move(source)), + from_binary_(false) { + const auto source_ptr = &source_[0]; + CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_ptr, nullptr, 0, nullptr, nullptr)); + } + + // PTX-based constructor + explicit Program(const Device &device, const Context &context, const std::string &binary): + program_(nullptr), // not used + source_(binary), + from_binary_(true) { + } + + // Compiles the device program and checks whether or not there are any warnings/errors + void Build(const Device &device, std::vector<std::string> &options) { + options.push_back("-arch=" + device.ComputeArch()); + if (from_binary_) { return; } + auto raw_options = std::vector<const char*>(); + for (const auto &option: options) { + raw_options.push_back(option.c_str()); + } + auto status = nvrtcCompileProgram(*program_, raw_options.size(), raw_options.data()); + CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram"); + CheckError(cuModuleLoadDataEx(&module_, GetIR().data(), 0, nullptr, nullptr)); + } + + // Confirms whether a certain status code is an actual compilation error or warning + bool StatusIsCompilationWarningOrError(const nvrtcResult status) const { + return (status == NVRTC_ERROR_COMPILATION); + } + + // Retrieves the warning/error message from the compiler (if any) + std::string GetBuildInfo(const Device &) const { + if (from_binary_) { return std::string{}; } + auto bytes = size_t{0}; + CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0])); + return result; + } + + // Retrieves an intermediate representation of the compiled program (i.e. PTX) + std::string GetIR() const { + if (from_binary_) { return source_; } // holds the PTX + auto bytes = size_t{0}; + CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0])); + return result; + } + + // Accessor to the private data-members + const CUmodule GetModule() const { return module_; } + const nvrtcProgram& operator()() const { return *program_; } +private: + std::shared_ptr<nvrtcProgram> program_; + CUmodule module_; + std::string source_; + bool from_binary_; +}; + +// ================================================================================================= + +// Raw command-queue type +using RawCommandQueue = CUstream; + +// C++11 version of 'CUstream' +class Queue { +public: + // Note that there is no constructor based on the regular CUDA data-type because of extra state + + // Regular constructor with memory management + explicit Queue(const Context &context, const Device &device): + queue_(new CUstream, [](CUstream* s) { + if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); } + delete s; + }), + context_(context), + device_(device) { + CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING)); + } + + // Synchronizes the queue and optionally also an event + void Finish(Event &event) const { + CheckError(cuEventSynchronize(event.end())); + Finish(); + } + void Finish() const { + CheckError(cuStreamSynchronize(*queue_)); + } + + // Retrieves the corresponding context or device + Context GetContext() const { return context_; } + Device GetDevice() const { return device_; } + + // Accessor to the private data-member + const RawCommandQueue& operator()() const { return *queue_; } +private: + std::shared_ptr<CUstream> queue_; + const Context context_; + const Device device_; +}; + +// ================================================================================================= + +// C++11 version of page-locked host memory +template <typename T> +class BufferHost { +public: + + // Regular constructor with memory management + explicit BufferHost(const Context &, const size_t size): + buffer_(new void*, [](void** m) { CheckError(cuMemFreeHost(*m)); delete m; }), + size_(size) { + CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T))); + } + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + return size_*sizeof(T); + } + + // Compatibility with std::vector + size_t size() const { return size_; } + T* begin() { return &static_cast<T*>(*buffer_)[0]; } + T* end() { return &static_cast<T*>(*buffer_)[size_-1]; } + T& operator[](const size_t i) { return static_cast<T*>(*buffer_)[i]; } + T* data() { return static_cast<T*>(*buffer_); } + const T* data() const { return static_cast<T*>(*buffer_); } + +private: + std::shared_ptr<void*> buffer_; + const size_t size_; +}; + +// ================================================================================================= + +// Enumeration of buffer access types +enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned }; + +// C++11 version of 'CUdeviceptr' +template <typename T> +class Buffer { +public: + + // Constructor based on the regular CUDA data-type: memory management is handled elsewhere + explicit Buffer(const CUdeviceptr buffer): + buffer_(new CUdeviceptr), + access_(BufferAccess::kNotOwned) { + *buffer_ = buffer; + } + + // Regular constructor with memory management. If this class does not own the buffer object, then + // the memory will not be freed automatically afterwards. + explicit Buffer(const Context &, const BufferAccess access, const size_t size): + buffer_(new CUdeviceptr, [access](CUdeviceptr* m) { + if (access != BufferAccess::kNotOwned) { CheckError(cuMemFree(*m)); } + delete m; + }), + access_(access) { + CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T))); + } + + // As above, but now with read/write access as a default + explicit Buffer(const Context &context, const size_t size): + Buffer<T>(context, BufferAccess::kReadWrite, size) { + } + + // Constructs a new buffer based on an existing host-container + template <typename Iterator> + explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end): + Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) { + auto size = static_cast<size_t>(end - start); + auto pointer = &*start; + CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue())); + queue.Finish(); + } + + // Copies from device to host: reading the device buffer a-synchronously + void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { + if (access_ == BufferAccess::kWriteOnly) { + throw LogicError("Buffer: reading from a write-only buffer"); + } + CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue())); + } + void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host, + const size_t offset = 0) const { + if (host.size() < size) { + throw LogicError("Buffer: target host buffer is too small"); + } + ReadAsync(queue, size, host.data(), offset); + } + void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host, + const size_t offset = 0) const { + if (host.size() < size) { + throw LogicError("Buffer: target host buffer is too small"); + } + ReadAsync(queue, size, host.data(), offset); + } + + // Copies from device to host: reading the device buffer + void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { + ReadAsync(queue, size, host, offset); + queue.Finish(); + } + void Read(const Queue &queue, const size_t size, std::vector<T> &host, + const size_t offset = 0) const { + Read(queue, size, host.data(), offset); + } + void Read(const Queue &queue, const size_t size, BufferHost<T> &host, + const size_t offset = 0) const { + Read(queue, size, host.data(), offset); + } + + // Copies from host to device: writing the device buffer a-synchronously + void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { + if (access_ == BufferAccess::kReadOnly) { + throw LogicError("Buffer: writing to a read-only buffer"); + } + if (GetSize() < (offset+size)*sizeof(T)) { + throw LogicError("Buffer: target device buffer is too small"); + } + CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue())); + } + void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host, + const size_t offset = 0) { + WriteAsync(queue, size, host.data(), offset); + } + void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host, + const size_t offset = 0) { + WriteAsync(queue, size, host.data(), offset); + } + + // Copies from host to device: writing the device buffer + void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { + WriteAsync(queue, size, host, offset); + queue.Finish(); + } + void Write(const Queue &queue, const size_t size, const std::vector<T> &host, + const size_t offset = 0) { + Write(queue, size, host.data(), offset); + } + void Write(const Queue &queue, const size_t size, const BufferHost<T> &host, + const size_t offset = 0) { + Write(queue, size, host.data(), offset); + } + + // Copies the contents of this buffer into another device buffer + void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const { + CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue())); + } + void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const { + CopyToAsync(queue, size, destination); + queue.Finish(); + } + + // Retrieves the actual allocated size in bytes + size_t GetSize() const { + auto result = size_t{0}; + CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_)); + return result; + } + + // Accessors to the private data-members + CUdeviceptr operator()() const { return *buffer_; } + CUdeviceptr& operator()() { return *buffer_; } +private: + std::shared_ptr<CUdeviceptr> buffer_; + const BufferAccess access_; +}; + +// ================================================================================================= + +// C++11 version of 'CUfunction' +class Kernel { +public: + + // Constructor based on the regular CUDA data-type: memory management is handled elsewhere + explicit Kernel(const CUfunction kernel): + name_("unknown"), + kernel_(kernel) { + } + + // Regular constructor with memory management + explicit Kernel(const Program &program, const std::string &name): name_(name) { + CheckError(cuModuleGetFunction(&kernel_, program.GetModule(), name.c_str())); + } + + // Sets a kernel argument at the indicated position. This stores both the value of the argument + // (as raw bytes) and the index indicating where this value can be found. + template <typename T> + void SetArgument(const size_t index, const T &value) { + if (index >= arguments_indices_.size()) { arguments_indices_.resize(index+1); } + arguments_indices_[index] = arguments_data_.size(); + for (auto j=size_t(0); j<sizeof(T); ++j) { + arguments_data_.push_back(reinterpret_cast<const char*>(&value)[j]); + } + } + template <typename T> + void SetArgument(const size_t index, Buffer<T> &value) { + SetArgument(index, value()); + } + + // Sets all arguments in one go using parameter packs. Note that this resets all previously set + // arguments using 'SetArgument' or 'SetArguments'. + template <typename... Args> + void SetArguments(Args&... args) { + arguments_indices_.clear(); + arguments_data_.clear(); + SetArgumentsRecursive(0, args...); + } + + // Retrieves the amount of local memory used per work-group for this kernel. Note that this the + // shared memory in CUDA terminology. + unsigned long LocalMemUsage(const Device &) const { + auto result = 0; + CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_)); + return static_cast<unsigned long>(result); + } + + // Retrieves the name of the kernel + std::string GetFunctionName() const { + return name_; + } + + // Launches a kernel onto the specified queue + void Launch(const Queue &queue, const std::vector<size_t> &global, + const std::vector<size_t> &local, EventPointer event) { + // TODO: Currently this CUDA launch is always synchronous due to a cuStreamSynchronize call + if (local.size() == 0) { + throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end"); + } + + // Creates the grid (number of threadblocks) and sets the block sizes (threads per block) + auto grid = std::vector<size_t>{1, 1, 1}; + auto block = std::vector<size_t>{1, 1, 1}; + if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); } + for (auto i=size_t{0}; i<local.size(); ++i) { grid[i] = global[i]/local[i]; } + for (auto i=size_t{0}; i<local.size(); ++i) { block[i] = local[i]; } + + // Creates the array of pointers from the arrays of indices & data + std::vector<void*> pointers; + for (auto &index: arguments_indices_) { + pointers.push_back(&arguments_data_[index]); + } + + // Launches the kernel, its execution time is recorded by events + if (event) { CheckError(cuEventRecord(event->start(), queue())); } + CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2], + 0, queue(), pointers.data(), nullptr)); + cuStreamSynchronize(queue()); + if (event) { CheckError(cuEventRecord(event->end(), queue())); } + } + + // As above, but with an event waiting list + void Launch(const Queue &queue, const std::vector<size_t> &global, + const std::vector<size_t> &local, EventPointer event, + const std::vector<Event>& waitForEvents) { + for (auto &waitEvent : waitForEvents) { + waitEvent.WaitForCompletion(); // note: doesn't do anything, every kernel call is synchronous + } + return Launch(queue, global, local, event); + } + + // Accessors to the private data-members + const CUfunction& operator()() const { return kernel_; } + CUfunction operator()() { return kernel_; } +private: + const std::string name_; + CUfunction kernel_; + std::vector<size_t> arguments_indices_; // Indices of the arguments + std::vector<char> arguments_data_; // The arguments data as raw bytes + + // Internal implementation for the recursive SetArguments function. + template <typename T> + void SetArgumentsRecursive(const size_t index, T &first) { + SetArgument(index, first); + } + template <typename T, typename... Args> + void SetArgumentsRecursive(const size_t index, T &first, Args&... args) { + SetArgument(index, first); + SetArgumentsRecursive(index+1, args...); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_CUPP11_H_ +#endif diff --git a/src/cxpp11_common.hpp b/src/cxpp11_common.hpp index 6ac008be..5097eac4 100644 --- a/src/cxpp11_common.hpp +++ b/src/cxpp11_common.hpp @@ -15,6 +15,7 @@ #ifndef CLBLAST_CXPP11_COMMON_H_ #define CLBLAST_CXPP11_COMMON_H_ +#include <cstring> // strchr #include <string> // std::string #include <stdexcept> // std::runtime_error diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 9481881e..01c411bc 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -24,14 +24,16 @@ R"( // ================================================================================================= -// Enable support for double-precision -#if PRECISION == 16 - #pragma OPENCL EXTENSION cl_khr_fp16: enable -#endif +#ifndef CUDA + // Enable support for double-precision + #if PRECISION == 16 + #pragma OPENCL EXTENSION cl_khr_fp16: enable + #endif -// Enable support for double-precision -#if PRECISION == 64 || PRECISION == 6464 - #pragma OPENCL EXTENSION cl_khr_fp64: enable + // Enable support for double-precision + #if PRECISION == 64 || PRECISION == 6464 + #pragma OPENCL EXTENSION cl_khr_fp64: enable + #endif #endif // Half-precision @@ -117,10 +119,15 @@ R"( #define GetRealArg(x) x #endif +// Pointers to local memory objects (using a define because CUDA doesn't need them) +#ifndef LOCAL_PTR + #define LOCAL_PTR __local +#endif + // ================================================================================================= // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific -// devices, this is enabled (see src/routine.cc). +// devices, this is enabled (see src/routine.cpp). #ifndef USE_CL_MAD #define USE_CL_MAD 0 #endif @@ -254,18 +261,18 @@ R"( // http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf // More details: https://github.com/CNugteren/CLBlast/issues/53 #if USE_STAGGERED_INDICES == 1 - INLINE_FUNC size_t GetGroupIDFlat() { + INLINE_FUNC int GetGroupIDFlat() { return get_group_id(0) + get_num_groups(0) * get_group_id(1); } - INLINE_FUNC size_t GetGroupID1() { + INLINE_FUNC int GetGroupID1() { return (GetGroupIDFlat()) % get_num_groups(1); } - INLINE_FUNC size_t GetGroupID0() { + INLINE_FUNC int GetGroupID0() { return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0); } #else - INLINE_FUNC size_t GetGroupID1() { return get_group_id(1); } - INLINE_FUNC size_t GetGroupID0() { return get_group_id(0); } + INLINE_FUNC int GetGroupID1() { return get_group_id(1); } + INLINE_FUNC int GetGroupID0() { return get_group_id(0); } #endif // ================================================================================================= diff --git a/src/kernels/level2/level2.opencl b/src/kernels/level2/level2.opencl index 505231ca..ff46c2a5 100644 --- a/src/kernels/level2/level2.opencl +++ b/src/kernels/level2/level2.opencl @@ -34,7 +34,7 @@ R"( // Returns an element from a vector INLINE_FUNC real LoadVector(const int id, const int max, - __global real* gm, const int offset, const int inc, + const __global real* gm, const int offset, const int inc, const int do_conjugate) { if (id < max) { real result = gm[id*inc + offset]; diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl index 93241700..281fdcff 100644 --- a/src/kernels/level3/invert_diagonal_blocks.opencl +++ b/src/kernels/level3/invert_diagonal_blocks.opencl @@ -164,7 +164,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src // ================================================================================================= // Triple matrix-multiplication kernel: C = A * B -INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n, +INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, LOCAL_PTR real* blm, int n, __global const real* agm, __global const real* bgm, __global real* cgm, const int lda, const int ldb, const int ldc, int current_size, int num_pages, const int block_size) { @@ -250,7 +250,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, // ================================================================================================= // Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower) -INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n, +INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR real* blm, int n, __global const real* src, const int a_offset, const int lda, __global real* dest, int current_size, int num_pages, const int block_size) { @@ -286,7 +286,7 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, __local rea } // Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower) -INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n, +INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR real* blm, const int n, __global real* dest, int current_size, int num_pages, const int block_size) { // Emulates a 3D grid: NX * (NY * num_pages) diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl index 70156d3a..37b25d99 100644 --- a/src/kernels/level3/transpose_fast.opencl +++ b/src/kernels/level3/transpose_fast.opencl @@ -84,39 +84,39 @@ void TransposeMatrixFast(const int ld, #if TRA_WPT == 1 results[0] = v[0]; #elif TRA_WPT == 2 - results[0] = (realT) {v[0].x, v[1].x}; - results[1] = (realT) {v[0].y, v[1].y}; + results[0].x = v[0].x; results[0].y = v[1].x; + results[1].x = v[0].y; results[1].y = v[1].y; #elif TRA_WPT == 4 - results[0] = (realT) {v[0].x, v[1].x, v[2].x, v[3].x}; - results[1] = (realT) {v[0].y, v[1].y, v[2].y, v[3].y}; - results[2] = (realT) {v[0].z, v[1].z, v[2].z, v[3].z}; - results[3] = (realT) {v[0].w, v[1].w, v[2].w, v[3].w}; + results[0].x = v[0].x; results[0].y = v[1].x; results[0].z = v[2].x; results[0].w = v[3].x; + results[1].x = v[0].y; results[1].y = v[1].y; results[1].z = v[2].y; results[1].w = v[3].y; + results[2].x = v[0].z; results[2].y = v[1].z; results[2].z = v[2].z; results[2].w = v[3].z; + results[3].x = v[0].w; results[3].y = v[1].w; results[3].z = v[2].w; results[3].w = v[3].w; #elif TRA_WPT == 8 - results[0] = (realT) {v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0}; - results[1] = (realT) {v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1}; - results[2] = (realT) {v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2}; - results[3] = (realT) {v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3}; - results[4] = (realT) {v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4}; - results[5] = (realT) {v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5}; - results[6] = (realT) {v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6}; - results[7] = (realT) {v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7}; + results[0].s0 = v[0].s0; results[0].s1 = v[1].s0; results[0].s2 = v[2].s0; results[0].s3 = v[3].s0; results[0].s4 = v[4].s0; results[0].s5 = v[5].s0; results[0].s6 = v[6].s0; results[0].s7 = v[7].s0; + results[1].s0 = v[0].s1; results[1].s1 = v[1].s1; results[1].s2 = v[2].s1; results[1].s3 = v[3].s1; results[1].s4 = v[4].s1; results[1].s5 = v[5].s1; results[1].s6 = v[6].s1; results[1].s7 = v[7].s1; + results[2].s0 = v[0].s2; results[2].s1 = v[1].s2; results[2].s2 = v[2].s2; results[2].s3 = v[3].s2; results[2].s4 = v[4].s2; results[2].s5 = v[5].s2; results[2].s6 = v[6].s2; results[2].s7 = v[7].s2; + results[3].s0 = v[0].s3; results[3].s1 = v[1].s3; results[3].s2 = v[2].s3; results[3].s3 = v[3].s3; results[3].s4 = v[4].s3; results[3].s5 = v[5].s3; results[3].s6 = v[6].s3; results[3].s7 = v[7].s3; + results[4].s0 = v[0].s4; results[4].s1 = v[1].s4; results[4].s2 = v[2].s4; results[4].s3 = v[3].s4; results[4].s4 = v[4].s4; results[4].s5 = v[5].s4; results[4].s6 = v[6].s4; results[4].s7 = v[7].s4; + results[5].s0 = v[0].s5; results[5].s1 = v[1].s5; results[5].s2 = v[2].s5; results[5].s3 = v[3].s5; results[5].s4 = v[4].s5; results[5].s5 = v[5].s5; results[5].s6 = v[6].s5; results[5].s7 = v[7].s5; + results[6].s0 = v[0].s6; results[6].s1 = v[1].s6; results[6].s2 = v[2].s6; results[6].s3 = v[3].s6; results[6].s4 = v[4].s6; results[6].s5 = v[5].s6; results[6].s6 = v[6].s6; results[6].s7 = v[7].s6; + results[7].s0 = v[0].s7; results[7].s1 = v[1].s7; results[7].s2 = v[2].s7; results[7].s3 = v[3].s7; results[7].s4 = v[4].s7; results[7].s5 = v[5].s7; results[7].s6 = v[6].s7; results[7].s7 = v[7].s7; #elif TRA_WPT == 16 - results[ 0] = (realT) {v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0, v[8].s0, v[9].s0, v[10].s0, v[11].s0, v[12].s0, v[13].s0, v[14].s0, v[15].s0}; - results[ 1] = (realT) {v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1, v[8].s1, v[9].s1, v[10].s1, v[11].s1, v[12].s1, v[13].s1, v[14].s1, v[15].s1}; - results[ 2] = (realT) {v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2, v[8].s2, v[9].s2, v[10].s2, v[11].s2, v[12].s2, v[13].s2, v[14].s2, v[15].s2}; - results[ 3] = (realT) {v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3, v[8].s3, v[9].s3, v[10].s3, v[11].s3, v[12].s3, v[13].s3, v[14].s3, v[15].s3}; - results[ 4] = (realT) {v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4, v[8].s4, v[9].s4, v[10].s4, v[11].s4, v[12].s4, v[13].s4, v[14].s4, v[15].s4}; - results[ 5] = (realT) {v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5, v[8].s5, v[9].s5, v[10].s5, v[11].s5, v[12].s5, v[13].s5, v[14].s5, v[15].s5}; - results[ 6] = (realT) {v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6, v[8].s6, v[9].s6, v[10].s6, v[11].s6, v[12].s6, v[13].s6, v[14].s6, v[15].s6}; - results[ 7] = (realT) {v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7, v[8].s7, v[9].s7, v[10].s7, v[11].s7, v[12].s7, v[13].s7, v[14].s7, v[15].s7}; - results[ 8] = (realT) {v[0].s8, v[1].s8, v[2].s8, v[3].s8, v[4].s8, v[5].s8, v[6].s8, v[7].s8, v[8].s8, v[9].s8, v[10].s8, v[11].s8, v[12].s8, v[13].s8, v[14].s8, v[15].s8}; - results[ 9] = (realT) {v[0].s9, v[1].s9, v[2].s9, v[3].s9, v[4].s9, v[5].s9, v[6].s9, v[7].s9, v[8].s9, v[9].s9, v[10].s9, v[11].s9, v[12].s9, v[13].s9, v[14].s9, v[15].s9}; - results[10] = (realT) {v[0].sA, v[1].sA, v[2].sA, v[3].sA, v[4].sA, v[5].sA, v[6].sA, v[7].sA, v[8].sA, v[9].sA, v[10].sA, v[11].sA, v[12].sA, v[13].sA, v[14].sA, v[15].sA}; - results[11] = (realT) {v[0].sB, v[1].sB, v[2].sB, v[3].sB, v[4].sB, v[5].sB, v[6].sB, v[7].sB, v[8].sB, v[9].sB, v[10].sB, v[11].sB, v[12].sB, v[13].sB, v[14].sB, v[15].sB}; - results[12] = (realT) {v[0].sC, v[1].sC, v[2].sC, v[3].sC, v[4].sC, v[5].sC, v[6].sC, v[7].sC, v[8].sC, v[9].sC, v[10].sC, v[11].sC, v[12].sC, v[13].sC, v[14].sC, v[15].sC}; - results[13] = (realT) {v[0].sD, v[1].sD, v[2].sD, v[3].sD, v[4].sD, v[5].sD, v[6].sD, v[7].sD, v[8].sD, v[9].sD, v[10].sD, v[11].sD, v[12].sD, v[13].sD, v[14].sD, v[15].sD}; - results[14] = (realT) {v[0].sE, v[1].sE, v[2].sE, v[3].sE, v[4].sE, v[5].sE, v[6].sE, v[7].sE, v[8].sE, v[9].sE, v[10].sE, v[11].sE, v[12].sE, v[13].sE, v[14].sE, v[15].sE}; - results[15] = (realT) {v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF}; + results[ 0].s0 = v[0].s0; results[ 0].s1 = v[1].s0; results[ 0].s2 = v[2].s0; results[ 0].s3 = v[3].s0; results[ 0].s4 = v[4].s0; results[ 0].s5 = v[5].s0; results[ 0].s6 = v[6].s0; results[ 0].s7 = v[7].s0; results[ 0].s8 = v[8].s0; results[ 0].s9 = v[9].s0; results[ 0].sA = v[10].s0; results[ 0].sB = v[11].s0; results[ 0].sC = v[12].s0; results[ 0].sD = v[13].s0; results[ 0].sE = v[14].s0; results[ 0].sF = v[15].s0; + results[ 1].s0 = v[0].s1; results[ 1].s1 = v[1].s1; results[ 1].s2 = v[2].s1; results[ 1].s3 = v[3].s1; results[ 1].s4 = v[4].s1; results[ 1].s5 = v[5].s1; results[ 1].s6 = v[6].s1; results[ 1].s7 = v[7].s1; results[ 1].s8 = v[8].s1; results[ 1].s9 = v[9].s1; results[ 1].sA = v[10].s1; results[ 1].sB = v[11].s1; results[ 1].sC = v[12].s1; results[ 1].sD = v[13].s1; results[ 1].sE = v[14].s1; results[ 1].sF = v[15].s1; + results[ 2].s0 = v[0].s2; results[ 2].s1 = v[1].s2; results[ 2].s2 = v[2].s2; results[ 2].s3 = v[3].s2; results[ 2].s4 = v[4].s2; results[ 2].s5 = v[5].s2; results[ 2].s6 = v[6].s2; results[ 2].s7 = v[7].s2; results[ 2].s8 = v[8].s2; results[ 2].s9 = v[9].s2; results[ 2].sA = v[10].s2; results[ 2].sB = v[11].s2; results[ 2].sC = v[12].s2; results[ 2].sD = v[13].s2; results[ 2].sE = v[14].s2; results[ 2].sF = v[15].s2; + results[ 3].s0 = v[0].s3; results[ 3].s1 = v[1].s3; results[ 3].s2 = v[2].s3; results[ 3].s3 = v[3].s3; results[ 3].s4 = v[4].s3; results[ 3].s5 = v[5].s3; results[ 3].s6 = v[6].s3; results[ 3].s7 = v[7].s3; results[ 3].s8 = v[8].s3; results[ 3].s9 = v[9].s3; results[ 3].sA = v[10].s3; results[ 3].sB = v[11].s3; results[ 3].sC = v[12].s3; results[ 3].sD = v[13].s3; results[ 3].sE = v[14].s3; results[ 3].sF = v[15].s3; + results[ 4].s0 = v[0].s4; results[ 4].s1 = v[1].s4; results[ 4].s2 = v[2].s4; results[ 4].s3 = v[3].s4; results[ 4].s4 = v[4].s4; results[ 4].s5 = v[5].s4; results[ 4].s6 = v[6].s4; results[ 4].s7 = v[7].s4; results[ 4].s8 = v[8].s4; results[ 4].s9 = v[9].s4; results[ 4].sA = v[10].s4; results[ 4].sB = v[11].s4; results[ 4].sC = v[12].s4; results[ 4].sD = v[13].s4; results[ 4].sE = v[14].s4; results[ 4].sF = v[15].s4; + results[ 5].s0 = v[0].s5; results[ 5].s1 = v[1].s5; results[ 5].s2 = v[2].s5; results[ 5].s3 = v[3].s5; results[ 5].s4 = v[4].s5; results[ 5].s5 = v[5].s5; results[ 5].s6 = v[6].s5; results[ 5].s7 = v[7].s5; results[ 5].s8 = v[8].s5; results[ 5].s9 = v[9].s5; results[ 5].sA = v[10].s5; results[ 5].sB = v[11].s5; results[ 5].sC = v[12].s5; results[ 5].sD = v[13].s5; results[ 5].sE = v[14].s5; results[ 5].sF = v[15].s5; + results[ 6].s0 = v[0].s6; results[ 6].s1 = v[1].s6; results[ 6].s2 = v[2].s6; results[ 6].s3 = v[3].s6; results[ 6].s4 = v[4].s6; results[ 6].s5 = v[5].s6; results[ 6].s6 = v[6].s6; results[ 6].s7 = v[7].s6; results[ 6].s8 = v[8].s6; results[ 6].s9 = v[9].s6; results[ 6].sA = v[10].s6; results[ 6].sB = v[11].s6; results[ 6].sC = v[12].s6; results[ 6].sD = v[13].s6; results[ 6].sE = v[14].s6; results[ 6].sF = v[15].s6; + results[ 7].s0 = v[0].s7; results[ 7].s1 = v[1].s7; results[ 7].s2 = v[2].s7; results[ 7].s3 = v[3].s7; results[ 7].s4 = v[4].s7; results[ 7].s5 = v[5].s7; results[ 7].s6 = v[6].s7; results[ 7].s7 = v[7].s7; results[ 7].s8 = v[8].s7; results[ 7].s9 = v[9].s7; results[ 7].sA = v[10].s7; results[ 7].sB = v[11].s7; results[ 7].sC = v[12].s7; results[ 7].sD = v[13].s7; results[ 7].sE = v[14].s7; results[ 7].sF = v[15].s7; + results[ 8].s0 = v[0].s8; results[ 8].s1 = v[1].s8; results[ 8].s2 = v[2].s8; results[ 8].s3 = v[3].s8; results[ 8].s4 = v[4].s8; results[ 8].s5 = v[5].s8; results[ 8].s6 = v[6].s8; results[ 8].s7 = v[7].s8; results[ 8].s8 = v[8].s8; results[ 8].s9 = v[9].s8; results[ 8].sA = v[10].s8; results[ 8].sB = v[11].s8; results[ 8].sC = v[12].s8; results[ 8].sD = v[13].s8; results[ 8].sE = v[14].s8; results[ 8].sF = v[15].s8; + results[ 9].s0 = v[0].s9; results[ 9].s1 = v[1].s9; results[ 9].s2 = v[2].s9; results[ 9].s3 = v[3].s9; results[ 9].s4 = v[4].s9; results[ 9].s5 = v[5].s9; results[ 9].s6 = v[6].s9; results[ 9].s7 = v[7].s9; results[ 9].s8 = v[8].s9; results[ 9].s9 = v[9].s9; results[ 9].sA = v[10].s9; results[ 9].sB = v[11].s9; results[ 9].sC = v[12].s9; results[ 9].sD = v[13].s9; results[ 9].sE = v[14].s9; results[ 9].sF = v[15].s9; + results[10].s0 = v[0].sA; results[10].s1 = v[1].sA; results[10].s2 = v[2].sA; results[10].s3 = v[3].sA; results[10].s4 = v[4].sA; results[10].s5 = v[5].sA; results[10].s6 = v[6].sA; results[10].s7 = v[7].sA; results[10].s8 = v[8].sA; results[10].s9 = v[9].sA; results[10].sA = v[10].sA; results[10].sB = v[11].sA; results[10].sC = v[12].sA; results[10].sD = v[13].sA; results[10].sE = v[14].sA; results[10].sF = v[15].sA; + results[11].s0 = v[0].sB; results[11].s1 = v[1].sB; results[11].s2 = v[2].sB; results[11].s3 = v[3].sB; results[11].s4 = v[4].sB; results[11].s5 = v[5].sB; results[11].s6 = v[6].sB; results[11].s7 = v[7].sB; results[11].s8 = v[8].sB; results[11].s9 = v[9].sB; results[11].sA = v[10].sB; results[11].sB = v[11].sB; results[11].sC = v[12].sB; results[11].sD = v[13].sB; results[11].sE = v[14].sB; results[11].sF = v[15].sB; + results[12].s0 = v[0].sC; results[12].s1 = v[1].sC; results[12].s2 = v[2].sC; results[12].s3 = v[3].sC; results[12].s4 = v[4].sC; results[12].s5 = v[5].sC; results[12].s6 = v[6].sC; results[12].s7 = v[7].sC; results[12].s8 = v[8].sC; results[12].s9 = v[9].sC; results[12].sA = v[10].sC; results[12].sB = v[11].sC; results[12].sC = v[12].sC; results[12].sD = v[13].sC; results[12].sE = v[14].sC; results[12].sF = v[15].sC; + results[13].s0 = v[0].sD; results[13].s1 = v[1].sD; results[13].s2 = v[2].sD; results[13].s3 = v[3].sD; results[13].s4 = v[4].sD; results[13].s5 = v[5].sD; results[13].s6 = v[6].sD; results[13].s7 = v[7].sD; results[13].s8 = v[8].sD; results[13].s9 = v[9].sD; results[13].sA = v[10].sD; results[13].sB = v[11].sD; results[13].sC = v[12].sD; results[13].sD = v[13].sD; results[13].sE = v[14].sD; results[13].sF = v[15].sD; + results[14].s0 = v[0].sE; results[14].s1 = v[1].sE; results[14].s2 = v[2].sE; results[14].s3 = v[3].sE; results[14].s4 = v[4].sE; results[14].s5 = v[5].sE; results[14].s6 = v[6].sE; results[14].s7 = v[7].sE; results[14].s8 = v[8].sE; results[14].s9 = v[9].sE; results[14].sA = v[10].sE; results[14].sB = v[11].sE; results[14].sC = v[12].sE; results[14].sD = v[13].sE; results[14].sE = v[14].sE; results[14].sF = v[15].sE; + results[15].s0 = v[0].sF; results[15].s1 = v[1].sF; results[15].s2 = v[2].sF; results[15].s3 = v[3].sF; results[15].s4 = v[4].sF; results[15].s5 = v[5].sF; results[15].s6 = v[6].sF; results[15].s7 = v[7].sF; results[15].s8 = v[8].sF; results[15].s9 = v[9].sF; results[15].sA = v[10].sF; results[15].sB = v[11].sF; results[15].sC = v[12].sF; results[15].sD = v[13].sF; results[15].sE = v[14].sF; results[15].sF = v[15].sF; #endif // Multiplies by alpha and then stores the results into the destination matrix diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index 49c5b9a3..ba9a6a56 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -24,7 +24,7 @@ R"( // Transposes a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the transposed source matrix dimensions. -INLINE_FUNC void _TransposePadMatrix(__local real* tile, +INLINE_FUNC void _TransposePadMatrix(LOCAL_PTR real* tile, const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -105,7 +105,7 @@ void TransposePadMatrix(const int src_one, const int src_two, // Transposes a matrix, while considering possible padding in the source matrix. Data is read from a // padded source matrix, but only the actual data is written back to the transposed destination // matrix. This kernel optionally checks for upper/lower triangular matrices. -INLINE_FUNC void _TransposeMatrix(__local real* tile, +INLINE_FUNC void _TransposeMatrix(LOCAL_PTR real* tile, const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, diff --git a/src/kernels/level3/xgemm_direct_batched.opencl b/src/kernels/level3/xgemm_direct_batched.opencl index fa582cff..d946a056 100644 --- a/src/kernels/level3/xgemm_direct_batched.opencl +++ b/src/kernels/level3/xgemm_direct_batched.opencl @@ -19,8 +19,8 @@ R"( // ================================================================================================= // Direct version of the batched GEMM kernel with [A, B] = [non-transposed, non-transposed] -__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) -__kernel void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, +__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, @@ -40,8 +40,8 @@ __kernel void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int } // Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed] -__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) -__kernel void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, +__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, @@ -61,8 +61,8 @@ __kernel void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int } // Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed] -__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) -__kernel void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, +__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, @@ -82,8 +82,8 @@ __kernel void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int } // Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed] -__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) -__kernel void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, +__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl index 8b650589..7d185224 100644 --- a/src/kernels/level3/xgemm_direct_part1.opencl +++ b/src/kernels/level3/xgemm_direct_part1.opencl @@ -184,7 +184,7 @@ INLINE_FUNC void GlobalToPrivateCheckedB(const __global real* restrict bgms, rea // Caches on-chip local memory into per-thread private memory (registers). This function is specific // for caching the A input matrix. -INLINE_FUNC void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg, +INLINE_FUNC void LocalToPrivateDirectA(LOCAL_PTR real* alm, real apm[MWID], const int kg, const int a_transpose) { #pragma unroll for (int mi=0; mi<MWID; ++mi) { @@ -195,7 +195,7 @@ INLINE_FUNC void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const } // Same as above, but now for the B input matrix -INLINE_FUNC void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg, +INLINE_FUNC void LocalToPrivateDirectB(LOCAL_PTR real* blm, real bpm[NWID], const int kg, const int b_transpose) { #pragma unroll for (int ni=0; ni<NWID; ++ni) { diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl index 1d9330fc..c3bf1b80 100644 --- a/src/kernels/level3/xgemm_direct_part2.opencl +++ b/src/kernels/level3/xgemm_direct_part2.opencl @@ -19,7 +19,7 @@ R"( // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. -INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm, +INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, LOCAL_PTR real* alm, const int a_ld, const int a_offset, const int kwg, const int a_transpose, const int a_conjugate) { #if MDIMCD == MDIMAD @@ -90,7 +90,7 @@ INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, __loc } // Same as above, but now for the B input matrix -INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm, +INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, LOCAL_PTR real* blm, const int b_ld, const int b_offset, const int kwg, const int b_transpose, const int b_conjugate) { #if MDIMCD == NDIMBD @@ -165,7 +165,7 @@ INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, __loc // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. In contrast to the functions above, this function performs doesn't // use the vector data-types. -INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm, +INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, LOCAL_PTR real* alm, const int a_ld, const int a_offset, const int kwg, const int a_transpose, const int a_conjugate) { #if MDIMCD == MDIMAD @@ -196,7 +196,7 @@ INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, __loca } // Same as above, but now for the B input matrix -INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm, +INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, LOCAL_PTR real* blm, const int b_ld, const int b_offset, const int kwg, const int b_transpose, const int b_conjugate) { #if MDIMCD == NDIMBD @@ -231,7 +231,7 @@ INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, __loca // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. In contrast to the functions above, this function performs bounds // checks and doesn't use the vector data-types. -INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm, +INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, LOCAL_PTR real* alm, const int a_ld, const int a_offset, const int kwg, const int a_transpose, const int a_conjugate, const int kSizeM, const int kSizeK) { @@ -270,7 +270,7 @@ INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, __loc } // Same as above, but now for the B input matrix -INLINE_FUNC void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm, +INLINE_FUNC void GlobalToLocalCheckedB(const __global real* restrict bgms, LOCAL_PTR real* blm, const int b_ld, const int b_offset, const int kwg, const int b_transpose, const int b_conjugate, const int kSizeN, const int kSizeK) { diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl index b0beb614..5862dfa3 100644 --- a/src/kernels/level3/xgemm_direct_part3.opencl +++ b/src/kernels/level3/xgemm_direct_part3.opencl @@ -24,7 +24,7 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize const __global realMD* restrict agm, const int a_offset, const int a_ld, const __global realND* restrict bgm, const int b_offset, const int b_ld, __global real* cgm, const int c_offset, const int c_ld, - __local real* alm, __local real* blm, + LOCAL_PTR real* alm, LOCAL_PTR real* blm, const int a_transpose, const int b_transpose, const int c_transpose, const int a_conjugate, const int b_conjugate) { const real alpha = GetRealArg(arg_alpha); @@ -147,8 +147,8 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize // ================================================================================================= // Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed] -__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) -__kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK, +__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const __global realND* restrict bgm, const int b_offset, const int b_ld, @@ -162,8 +162,8 @@ __kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK } // Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed] -__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) -__kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, +__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const __global realND* restrict bgm, const int b_offset, const int b_ld, @@ -177,8 +177,8 @@ __kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK } // Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed] -__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) -__kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, +__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const __global realND* restrict bgm, const int b_offset, const int b_ld, @@ -192,8 +192,8 @@ __kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK } // Direct version of the GEMM kernel with [A, B] = [transposed, transposed] -__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) -__kernel void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK, +__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const __global realND* restrict bgm, const int b_offset, const int b_ld, diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 07dafe13..172b3c6b 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -186,7 +186,7 @@ INLINE_FUNC void InitAccRegisters(realM cpm[NWI][MWI/VWM]) { // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. #if SA == 1 -INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm, +INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, LOCAL_PTR realM* alm, const int kSizeM, const int tid, const int kwg) { const int la0 = tid % MDIMA; const int la1 = tid / MDIMA; @@ -216,7 +216,7 @@ INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, __local real // Same as above, but now for the B input matrix #if SB == 1 -INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm, +INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, LOCAL_PTR realN* blm, const int kSizeN, const int tid, const int kwg) { const int lb0 = tid % NDIMB; const int lb1 = tid / NDIMB; @@ -298,7 +298,7 @@ INLINE_FUNC void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[ // Caches on-chip local memory into per-thread private memory (registers). This function is specific // for caching the A input matrix. #if SA == 1 -INLINE_FUNC void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) { +INLINE_FUNC void LocalToPrivateA(LOCAL_PTR realM* alm, realM apm[MWI/VWM], const int kg) { #pragma unroll for (int mi=0; mi<MWI/VWM; ++mi) { #if STRM == 0 @@ -313,7 +313,7 @@ INLINE_FUNC void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const i // Same as above, but now for the B input matrix #if SB == 1 -INLINE_FUNC void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) { +INLINE_FUNC void LocalToPrivateB(LOCAL_PTR realN* blm, realN bpm[NWI/VWN], const int kg) { #pragma unroll for (int ni=0; ni<NWI/VWN; ++ni) { #if STRN == 0 diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl index f447677f..ce24907c 100644 --- a/src/kernels/level3/xgemm_part3.opencl +++ b/src/kernels/level3/xgemm_part3.opencl @@ -22,11 +22,11 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm, realM cpm[NWI][MWI/VWM] #if SA == 1 && SB == 1 - , __local realM* alm, __local realN* blm + , LOCAL_PTR realM* alm, LOCAL_PTR realN* blm #elif SA == 1 - , __local realM* alm + , LOCAL_PTR realM* alm #elif SB == 1 - , __local realN* blm + , LOCAL_PTR realN* blm #endif ) { diff --git a/src/kernels/opencl_to_cuda.h b/src/kernels/opencl_to_cuda.h new file mode 100644 index 00000000..5682a456 --- /dev/null +++ b/src/kernels/opencl_to_cuda.h @@ -0,0 +1,90 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains an (incomplete) header to interpret OpenCL kernels as CUDA kernels. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( +// ================================================================================================= + +// CLBlast specific additions +#define CUDA 1 +#define LOCAL_PTR // pointers to local memory don't have to be annotated in CUDA + +// Replaces the OpenCL get_xxx_ID with CUDA equivalents +__device__ int get_local_id(const int x) { + if (x == 0) { return threadIdx.x; } + if (x == 1) { return threadIdx.y; } + return threadIdx.z; +} +__device__ int get_group_id(const int x) { + if (x == 0) { return blockIdx.x; } + if (x == 1) { return blockIdx.y; } + return blockIdx.z; +} +__device__ int get_local_size(const int x) { + if (x == 0) { return blockDim.x; } + if (x == 1) { return blockDim.y; } + return blockDim.z; +} +__device__ int get_num_groups(const int x) { + if (x == 0) { return gridDim.x; } + if (x == 1) { return gridDim.y; } + return gridDim.z; +} +__device__ int get_global_size(const int x) { + if (x == 0) { return gridDim.x * blockDim.x; } + if (x == 1) { return gridDim.y * blockDim.y; } + return gridDim.z * blockDim.z; +} +__device__ int get_global_id(const int x) { + if (x == 0) { return blockIdx.x*blockDim.x + threadIdx.x; } + if (x == 1) { return blockIdx.y*blockDim.y + threadIdx.y; } + return blockIdx.z*blockDim.z + threadIdx.z; +} + +// Adds the data-types which are not available natively under CUDA +typedef struct { float s0; float s1; float s2; float s3; + float s4; float s5; float s6; float s7; } float8; +typedef struct { float s0; float s1; float s2; float s3; + float s4; float s5; float s6; float s7; + float s8; float s9; float s10; float s11; + float s12; float s13; float s14; float s15; } float16; +typedef struct { double s0; double s1; double s2; double s3; + double s4; double s5; double s6; double s7; } double8; +typedef struct { double s0; double s1; double s2; double s3; + double s4; double s5; double s6; double s7; + double s8; double s9; double s10; double s11; + double s12; double s13; double s14; double s15; } double16; + +// Replaces the OpenCL keywords with CUDA equivalent +#define __kernel __placeholder__ +#define __global +#define __placeholder__ extern "C" __global__ +#define __local __shared__ +#define restrict __restrict__ +#define __constant const +#define inline __device__ // assumes all device functions are annotated with inline in OpenCL + +// Kernel attributes (don't replace currently) +#define reqd_work_group_size(x, y, z) + +// Replaces OpenCL synchronisation with CUDA synchronisation +#define barrier(x) __syncthreads() + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= + diff --git a/src/routine.cpp b/src/routine.cpp index aaa85fde..0f9fe360 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -167,6 +167,13 @@ void Routine::InitProgram(std::initializer_list<const char *> source) { source_string += "#define GLOBAL_MEM_FENCE 1\n"; } + // Optionally adds a translation header from OpenCL kernels to CUDA kernels + #ifdef CUDA_API + source_string += + #include "kernels/opencl_to_cuda.h" + ; + #endif + // Loads the common header (typedefs and defines and such) source_string += #include "kernels/common.opencl" diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 84ccd9d2..bf3b1762 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -19,8 +19,7 @@ #include <string> #include <vector> -#include "clpp11.hpp" -#include "clblast.h" +#include "utilities/utilities.hpp" #include "database/database.hpp" namespace clblast { diff --git a/src/routines/levelx/xaxpybatched.cpp b/src/routines/levelx/xaxpybatched.cpp index 0b755ccf..52c27b78 100644 --- a/src/routines/levelx/xaxpybatched.cpp +++ b/src/routines/levelx/xaxpybatched.cpp @@ -59,9 +59,9 @@ void XaxpyBatched<T>::DoAxpyBatched(const size_t n, const std::vector<T> &alphas x_offsets_int[batch] = static_cast<int>(x_offsets[batch]); y_offsets_int[batch] = static_cast<int>(y_offsets[batch]); } - auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count); + auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto alphas_device = Buffer<T>(context_, BufferAccess::kReadWrite, batch_count); x_offsets_device.Write(queue_, batch_count, x_offsets_int); y_offsets_device.Write(queue_, batch_count, y_offsets_int); alphas_device.Write(queue_, batch_count, alphas); diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp index 4e9f0004..8a015e97 100644 --- a/src/routines/levelx/xgemmbatched.cpp +++ b/src/routines/levelx/xgemmbatched.cpp @@ -100,8 +100,8 @@ void XgemmBatched<T>::DoGemmBatched(const Layout layout, const Transpose a_trans } // Upload the scalar arguments to the device - auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count); - auto betas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count); + auto alphas_device = Buffer<T>(context_, BufferAccess::kReadWrite, batch_count); + auto betas_device = Buffer<T>(context_, BufferAccess::kReadWrite, batch_count); alphas_device.Write(queue_, batch_count, alphas); betas_device.Write(queue_, batch_count, betas); @@ -200,8 +200,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { - auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto a_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto a_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); a_offsets_device.Write(queue_, batch_count, a_offsets); a_offsets_i_device.Write(queue_, batch_count, a_offsets_i); auto eventProcessA = Event(); @@ -214,8 +214,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const // As above, but now for matrix B if (!b_no_temp) { - auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto b_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto b_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); b_offsets_device.Write(queue_, batch_count, b_offsets); b_offsets_i_device.Write(queue_, batch_count, b_offsets_i); auto eventProcessB = Event(); @@ -227,8 +227,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const } // As above, but now for matrix C - auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto c_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto c_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); if (!c_no_temp) { c_offsets_device.Write(queue_, batch_count, c_offsets); c_offsets_i_device.Write(queue_, batch_count, c_offsets_i); @@ -297,9 +297,9 @@ void XgemmBatched<T>::BatchedGemmDirect(const size_t m, const size_t n, const si const size_t batch_count) { // Uploads the offsets to the device - auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); a_offsets_device.Write(queue_, batch_count, a_offsets); b_offsets_device.Write(queue_, batch_count, b_offsets); c_offsets_device.Write(queue_, batch_count, c_offsets); diff --git a/src/routines/routines.hpp b/src/routines/routines.hpp new file mode 100644 index 00000000..9e7768b9 --- /dev/null +++ b/src/routines/routines.hpp @@ -0,0 +1,76 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains all the includes of all the routines in CLBlast. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_ROUTINES_H_ +#define CLBLAST_ROUTINES_ROUTINES_H_ + +// BLAS level-1 includes +#include "routines/level1/xswap.hpp" +#include "routines/level1/xscal.hpp" +#include "routines/level1/xcopy.hpp" +#include "routines/level1/xaxpy.hpp" +#include "routines/level1/xdot.hpp" +#include "routines/level1/xdotu.hpp" +#include "routines/level1/xdotc.hpp" +#include "routines/level1/xnrm2.hpp" +#include "routines/level1/xasum.hpp" +#include "routines/level1/xsum.hpp" // non-BLAS routine +#include "routines/level1/xamax.hpp" +#include "routines/level1/xamin.hpp" // non-BLAS routine +#include "routines/level1/xmax.hpp" // non-BLAS routine +#include "routines/level1/xmin.hpp" // non-BLAS routine + +// BLAS level-2 includes +#include "routines/level2/xgemv.hpp" +#include "routines/level2/xgbmv.hpp" +#include "routines/level2/xhemv.hpp" +#include "routines/level2/xhbmv.hpp" +#include "routines/level2/xhpmv.hpp" +#include "routines/level2/xsymv.hpp" +#include "routines/level2/xsbmv.hpp" +#include "routines/level2/xspmv.hpp" +#include "routines/level2/xtrmv.hpp" +#include "routines/level2/xtbmv.hpp" +#include "routines/level2/xtpmv.hpp" +#include "routines/level2/xtrsv.hpp" +#include "routines/level2/xger.hpp" +#include "routines/level2/xgeru.hpp" +#include "routines/level2/xgerc.hpp" +#include "routines/level2/xher.hpp" +#include "routines/level2/xhpr.hpp" +#include "routines/level2/xher2.hpp" +#include "routines/level2/xhpr2.hpp" +#include "routines/level2/xsyr.hpp" +#include "routines/level2/xspr.hpp" +#include "routines/level2/xsyr2.hpp" +#include "routines/level2/xspr2.hpp" + +// BLAS level-3 includes +#include "routines/level3/xgemm.hpp" +#include "routines/level3/xsymm.hpp" +#include "routines/level3/xhemm.hpp" +#include "routines/level3/xsyrk.hpp" +#include "routines/level3/xherk.hpp" +#include "routines/level3/xsyr2k.hpp" +#include "routines/level3/xher2k.hpp" +#include "routines/level3/xtrmm.hpp" +#include "routines/level3/xtrsm.hpp" + +// Level-x includes (non-BLAS) +#include "routines/levelx/xomatcopy.hpp" +#include "routines/levelx/xim2col.hpp" +#include "routines/levelx/xaxpybatched.hpp" +#include "routines/levelx/xgemmbatched.hpp" + +// CLBLAST_ROUTINES_ROUTINES_H_ +#endif diff --git a/src/utilities/buffer_test.hpp b/src/utilities/buffer_test.hpp index b5693181..fd071434 100644 --- a/src/utilities/buffer_test.hpp +++ b/src/utilities/buffer_test.hpp @@ -15,7 +15,7 @@ #ifndef CLBLAST_BUFFER_TEST_H_ #define CLBLAST_BUFFER_TEST_H_ -#include "clblast.h" +#include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= diff --git a/src/utilities/clblast_exceptions.hpp b/src/utilities/clblast_exceptions.hpp index 0d0033b6..a790be9c 100644 --- a/src/utilities/clblast_exceptions.hpp +++ b/src/utilities/clblast_exceptions.hpp @@ -16,8 +16,7 @@ #ifndef CLBLAST_EXCEPTIONS_H_ #define CLBLAST_EXCEPTIONS_H_ -#include "clpp11.hpp" -#include "clblast.h" +#include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp index a5c1d45e..f2574104 100644 --- a/src/utilities/utilities.cpp +++ b/src/utilities/utilities.cpp @@ -413,13 +413,17 @@ std::string GetDeviceVendor(const Device& device) { // Mid-level info std::string GetDeviceArchitecture(const Device& device) { auto device_architecture = std::string{""}; - if (device.HasExtension(kKhronosAttributesNVIDIA)) { + #ifdef CUDA_API device_architecture = device.NVIDIAComputeCapability(); - } - else if (device.HasExtension(kKhronosAttributesAMD)) { - device_architecture = device.Name(); // Name is architecture for AMD APP and AMD ROCm - } - // Note: no else - 'device_architecture' might be the empty string + #else + if (device.HasExtension(kKhronosAttributesNVIDIA)) { + device_architecture = device.NVIDIAComputeCapability(); + } + else if (device.HasExtension(kKhronosAttributesAMD)) { + device_architecture = device.Name(); // Name is architecture for AMD APP and AMD ROCm + } + // Note: no else - 'device_architecture' might be the empty string + #endif for (auto &find_and_replace : device_mapping::kArchitectureNames) { // replacing to common names if (device_architecture == find_and_replace.first) { device_architecture = find_and_replace.second; } diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index b2949c27..f56226be 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -21,8 +21,13 @@ #include <complex> #include <random> -#include "clpp11.hpp" -#include "clblast.h" +#ifdef OPENCL_API + #include "clpp11.hpp" + #include "clblast.h" +#elif CUDA_API + #include "cupp11.hpp" + #include "clblast_cuda.h" +#endif #include "clblast_half.h" #include "utilities/clblast_exceptions.hpp" #include "utilities/msvc.hpp" |