summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-10-11 23:16:57 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2017-10-11 23:16:57 +0200
commitb901809345848b44442c787380b13db5e5156df0 (patch)
tree0f14f5e38c08b604a96304abda427fe6ce3f64d6 /src
parent9224da19ef384c1a7986587a682035905f63cf55 (diff)
Added first (untested) version of a CUDA API
Diffstat (limited to 'src')
-rw-r--r--src/api_common.cpp2
-rw-r--r--src/clblast_cuda.cpp2336
-rw-r--r--src/cupp11.hpp770
-rw-r--r--src/utilities/buffer_test.hpp2
-rw-r--r--src/utilities/utilities.hpp9
5 files changed, 3115 insertions, 4 deletions
diff --git a/src/api_common.cpp b/src/api_common.cpp
index aa7e2b0f..0d387cd9 100644
--- a/src/api_common.cpp
+++ b/src/api_common.cpp
@@ -12,9 +12,9 @@
#include <string>
+#include "utilities/utilities.hpp"
#include "cache.hpp"
#include "routines/routines.hpp"
-#include "clblast.h"
namespace clblast {
// =================================================================================================
diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp
new file mode 100644
index 00000000..5f30d023
--- /dev/null
+++ b/src/clblast_cuda.cpp
@@ -0,0 +1,2336 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements all the BLAS API calls (CUDA version). In all cases, it does little more
+// than create a new object of the appropriate type and call the main routine on that object.
+// It forwards all status codes to the caller.
+//
+// =================================================================================================
+
+#include <string>
+
+#include "routines/routines.hpp"
+#include "clblast_cuda.h"
+
+namespace clblast {
+
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+// =================================================================================================
+
+// Generates a Givens plane rotation (SROTG/DROTG). Stub only: performs no work in the CUDA API
+template <typename T>
+StatusCode Rotg(CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream* stream) {  // 'stream' is the only named argument and it is not used
+ return StatusCode::kNotImplemented;  // unconditional: no routine is dispatched
+}
+template StatusCode PUBLIC_API Rotg<float>(CUdeviceptr, const size_t,  // explicit instantiations
+ CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Rotg<double>(CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+
+// Generates a modified Givens plane rotation (SROTMG/DROTMG). Stub only: performs no work
+template <typename T>
+StatusCode Rotmg(CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream* stream) {  // 'stream' is the only named argument and it is not used
+ return StatusCode::kNotImplemented;  // unconditional: no routine is dispatched
+}
+template StatusCode PUBLIC_API Rotmg<float>(CUdeviceptr, const size_t,  // explicit instantiations
+ CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Rotmg<double>(CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+
+// Applies a Givens plane rotation (SROT/DROT). Stub only: performs no work in the CUDA API
+template <typename T>
+StatusCode Rot(const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ const T,
+ const T,
+ CUstream* stream) {  // 'stream' is the only named argument and it is not used
+ return StatusCode::kNotImplemented;  // unconditional: no routine is dispatched
+}
+template StatusCode PUBLIC_API Rot<float>(const size_t,  // explicit instantiations follow
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ const float,
+ const float,
+ CUstream*);
+template StatusCode PUBLIC_API Rot<double>(const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ const double,
+ const double,
+ CUstream*);
+
+// Applies a modified Givens plane rotation (SROTM/DROTM). Stub only: performs no work
+template <typename T>
+StatusCode Rotm(const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream* stream) {  // 'stream' is the only named argument and it is not used
+ return StatusCode::kNotImplemented;  // unconditional: no routine is dispatched
+}
+template StatusCode PUBLIC_API Rotm<float>(const size_t,  // explicit instantiations follow
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Rotm<double>(const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+
+// Swaps two vectors (SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP) by forwarding to Xswap<T>::DoSwap
+template <typename T>
+StatusCode Swap(const size_t n,
+ CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xswap<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoSwap(n,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Swap<float>(const size_t,  // explicit instantiations follow
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Swap<double>(const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Swap<float2>(const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Swap<double2>(const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Swap<half>(const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Vector scaling (SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL): forwards to Xscal<T>::DoScal
+template <typename T>
+StatusCode Scal(const size_t n,
+ const T alpha,
+ CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xscal<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoScal(n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Scal<float>(const size_t,  // explicit instantiations follow
+ const float,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Scal<double>(const size_t,
+ const double,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Scal<float2>(const size_t,
+ const float2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Scal<double2>(const size_t,
+ const double2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Scal<half>(const size_t,
+ const half,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Vector copy (SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY): forwards to Xcopy<T>::DoCopy
+template <typename T>
+StatusCode Copy(const size_t n,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xcopy<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoCopy(n,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Copy<float>(const size_t,  // explicit instantiations follow
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Copy<double>(const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Copy<float2>(const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Copy<double2>(const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Copy<half>(const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Vector-times-constant plus vector (SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY): forwards to Xaxpy<T>::DoAxpy
+template <typename T>
+StatusCode Axpy(const size_t n,
+ const T alpha,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xaxpy<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoAxpy(n,
+ alpha,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Axpy<float>(const size_t,  // explicit instantiations follow
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Axpy<double>(const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Axpy<float2>(const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Axpy<double2>(const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Axpy<half>(const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Dot product of two vectors (SDOT/DDOT/HDOT): forwards to Xdot<T>::DoDot
+template <typename T>
+StatusCode Dot(const size_t n,
+ CUdeviceptr dot_buffer, const size_t dot_offset,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xdot<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoDot(n,
+ Buffer<T>(dot_buffer), dot_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Dot<float>(const size_t,  // explicit instantiations follow
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Dot<double>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Dot<half>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Dot product of two complex vectors (CDOTU/ZDOTU): forwards to Xdotu<T>::DoDotu
+template <typename T>
+StatusCode Dotu(const size_t n,
+ CUdeviceptr dot_buffer, const size_t dot_offset,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xdotu<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoDotu(n,
+ Buffer<T>(dot_buffer), dot_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Dotu<float2>(const size_t,  // explicit instantiations follow
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Dotu<double2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Dot product of two complex vectors, one conjugated (CDOTC/ZDOTC): forwards to Xdotc<T>::DoDotc
+template <typename T>
+StatusCode Dotc(const size_t n,
+ CUdeviceptr dot_buffer, const size_t dot_offset,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xdotc<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoDotc(n,
+ Buffer<T>(dot_buffer), dot_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Dotc<float2>(const size_t,  // explicit instantiations follow
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Dotc<double2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Euclidean norm of a vector (SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2): forwards to Xnrm2<T>::DoNrm2
+template <typename T>
+StatusCode Nrm2(const size_t n,
+ CUdeviceptr nrm2_buffer, const size_t nrm2_offset,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xnrm2<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoNrm2(n,
+ Buffer<T>(nrm2_buffer), nrm2_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Nrm2<float>(const size_t,  // explicit instantiations follow
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Nrm2<double>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Nrm2<float2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Nrm2<double2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Nrm2<half>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Absolute sum of values in a vector (SASUM/DASUM/ScASUM/DzASUM/HASUM): forwards to Xasum<T>::DoAsum
+template <typename T>
+StatusCode Asum(const size_t n,
+ CUdeviceptr asum_buffer, const size_t asum_offset,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xasum<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoAsum(n,
+ Buffer<T>(asum_buffer), asum_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Asum<float>(const size_t,  // explicit instantiations follow
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Asum<double>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Asum<float2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Asum<double2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Asum<half>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Sum of values in a vector, non-BLAS (SSUM/DSUM/ScSUM/DzSUM/HSUM): forwards to Xsum<T>::DoSum
+template <typename T>
+StatusCode Sum(const size_t n,
+ CUdeviceptr sum_buffer, const size_t sum_offset,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xsum<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoSum(n,
+ Buffer<T>(sum_buffer), sum_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Sum<float>(const size_t,  // explicit instantiations follow
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Sum<double>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Sum<float2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Sum<double2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Sum<half>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Index of absolute maximum value in a vector (iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX) via Xamax<T>
+template <typename T>
+StatusCode Amax(const size_t n,
+ CUdeviceptr imax_buffer, const size_t imax_offset,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xamax<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoAmax(n,
+ Buffer<unsigned int>(imax_buffer), imax_offset,  // index buffer is unsigned int, not T
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Amax<float>(const size_t,  // explicit instantiations follow
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Amax<double>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Amax<float2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Amax<double2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Amax<half>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Index of absolute minimum value in a vector, non-BLAS (iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN)
+template <typename T>
+StatusCode Amin(const size_t n,
+ CUdeviceptr imin_buffer, const size_t imin_offset,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xamin<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoAmin(n,
+ Buffer<unsigned int>(imin_buffer), imin_offset,  // index buffer is unsigned int, not T
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Amin<float>(const size_t,  // explicit instantiations follow
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Amin<double>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Amin<float2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Amin<double2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Amin<half>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Index of maximum value in a vector, non-BLAS (iSMAX/iDMAX/iCMAX/iZMAX/iHMAX) via Xmax<T>
+template <typename T>
+StatusCode Max(const size_t n,
+ CUdeviceptr imax_buffer, const size_t imax_offset,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xmax<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoMax(n,
+ Buffer<unsigned int>(imax_buffer), imax_offset,  // index buffer is unsigned int, not T
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Max<float>(const size_t,  // explicit instantiations follow
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Max<double>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Max<float2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Max<double2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Max<half>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Index of minimum value in a vector, non-BLAS (iSMIN/iDMIN/iCMIN/iZMIN/iHMIN) via Xmin<T>
+template <typename T>
+StatusCode Min(const size_t n,
+ CUdeviceptr imin_buffer, const size_t imin_offset,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xmin<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoMin(n,
+ Buffer<unsigned int>(imin_buffer), imin_offset,  // index buffer is unsigned int, not T
+ Buffer<T>(x_buffer), x_offset, x_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Min<float>(const size_t,  // explicit instantiations follow
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Min<double>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Min<float2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Min<double2>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Min<half>(const size_t,
+ CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+// =================================================================================================
+
+// General matrix-vector multiplication (SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV) via Xgemv<T>::DoGemv
+template <typename T>
+StatusCode Gemv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xgemv<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoGemv(layout, a_transpose,
+ m, n,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Gemv<float>(const Layout, const Transpose,  // explicit instantiations
+ const size_t, const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Gemv<double>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Gemv<float2>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Gemv<double2>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Gemv<half>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const half,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// General banded matrix-vector multiplication (SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV) via Xgbmv<T>
+template <typename T>
+StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xgbmv<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoGbmv(layout, a_transpose,
+ m, n, kl, ku,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Gbmv<float>(const Layout, const Transpose,  // explicit instantiations
+ const size_t, const size_t, const size_t, const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Gbmv<double>(const Layout, const Transpose,
+ const size_t, const size_t, const size_t, const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Gbmv<float2>(const Layout, const Transpose,
+ const size_t, const size_t, const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Gbmv<double2>(const Layout, const Transpose,
+ const size_t, const size_t, const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Gbmv<half>(const Layout, const Transpose,
+ const size_t, const size_t, const size_t, const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const half,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Hermitian matrix-vector multiplication (CHEMV/ZHEMV): forwards to Xhemv<T>::DoHemv
+template <typename T>
+StatusCode Hemv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xhemv<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoHemv(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Hemv<float2>(const Layout, const Triangle,  // explicit instantiations
+ const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Hemv<double2>(const Layout, const Triangle,
+ const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Hermitian banded matrix-vector multiplication (CHBMV/ZHBMV): forwards to Xhbmv<T>::DoHbmv
+template <typename T>
+StatusCode Hbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xhbmv<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoHbmv(layout, triangle,
+ n, k,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Hbmv<float2>(const Layout, const Triangle,  // explicit instantiations
+ const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Hbmv<double2>(const Layout, const Triangle,
+ const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Hermitian packed matrix-vector multiplication (CHPMV/ZHPMV): forwards to Xhpmv<T>::DoHpmv
+template <typename T>
+StatusCode Hpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const CUdeviceptr ap_buffer, const size_t ap_offset,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xhpmv<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoHpmv(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(ap_buffer), ap_offset,  // the ap operand is passed without a leading dimension
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Hpmv<float2>(const Layout, const Triangle,  // explicit instantiations
+ const size_t,
+ const float2,
+ const CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Hpmv<double2>(const Layout, const Triangle,
+ const size_t,
+ const double2,
+ const CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Symmetric matrix-vector multiplication (SSYMV/DSYMV/HSYMV): forwards to Xsymv<T>::DoSymv
+template <typename T>
+StatusCode Symv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+ const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+ CUstream* stream) {  // NOTE(review): 'stream' is never read in this body
+ try {
+ auto queue_cpp = Queue(*queue);  // NOTE(review): no 'queue' in this function; '*stream' meant?
+ auto routine = Xsymv<T>(queue_cpp, event);  // NOTE(review): no 'event' in this function either
+ routine.DoSymv(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
+ return StatusCode::kSuccess;  // reached only when nothing above threw
+ } catch (...) { return DispatchException(); }  // any exception: return DispatchException()'s status
+}
+template StatusCode PUBLIC_API Symv<float>(const Layout, const Triangle,  // explicit instantiations
+ const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Symv<double>(const Layout, const Triangle,
+ const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Symv<half>(const Layout, const Triangle,
+ const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const half,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xsbmv<T> routine on it and forwards the
+// arguments to DoSbmv, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoSbmv returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Sbmv(const Layout layout, const Triangle triangle,
+                const size_t n, const size_t k,
+                const T alpha,
+                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xsbmv<T>(queue_cpp, nullptr);
+    routine.DoSbmv(layout, triangle,
+                   n, k,
+                   alpha,
+                   Buffer<T>(a_buffer), a_offset, a_ld,
+                   Buffer<T>(x_buffer), x_offset, x_inc,
+                   beta,
+                   Buffer<T>(y_buffer), y_offset, y_inc);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Sbmv<float>(const Layout, const Triangle,
+ const size_t, const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Sbmv<double>(const Layout, const Triangle,
+ const size_t, const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Sbmv<half>(const Layout, const Triangle,
+ const size_t, const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const half,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xspmv<T> routine on it and forwards the
+// arguments to DoSpmv, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoSpmv returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Spmv(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const CUdeviceptr ap_buffer, const size_t ap_offset,
+                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xspmv<T>(queue_cpp, nullptr);
+    routine.DoSpmv(layout, triangle,
+                   n,
+                   alpha,
+                   Buffer<T>(ap_buffer), ap_offset,
+                   Buffer<T>(x_buffer), x_offset, x_inc,
+                   beta,
+                   Buffer<T>(y_buffer), y_offset, y_inc);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Spmv<float>(const Layout, const Triangle,
+ const size_t,
+ const float,
+ const CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Spmv<double>(const Layout, const Triangle,
+ const size_t,
+ const double,
+ const CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Spmv<half>(const Layout, const Triangle,
+ const size_t,
+ const half,
+ const CUdeviceptr, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const half,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xtrmv<T> routine on it and forwards the
+// arguments to DoTrmv, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoTrmv returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Trmv(const Layout layout, const Triangle triangle,
+                const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xtrmv<T>(queue_cpp, nullptr);
+    routine.DoTrmv(layout, triangle, a_transpose, diagonal,
+                   n,
+                   Buffer<T>(a_buffer), a_offset, a_ld,
+                   Buffer<T>(x_buffer), x_offset, x_inc);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trmv<double>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trmv<float2>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trmv<double2>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trmv<half>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xtbmv<T> routine on it and forwards the
+// arguments to DoTbmv, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoTbmv returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Tbmv(const Layout layout, const Triangle triangle,
+                const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n, const size_t k,
+                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xtbmv<T>(queue_cpp, nullptr);
+    routine.DoTbmv(layout, triangle, a_transpose, diagonal,
+                   n, k,
+                   Buffer<T>(a_buffer), a_offset, a_ld,
+                   Buffer<T>(x_buffer), x_offset, x_inc);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tbmv<double>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tbmv<float2>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tbmv<double2>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tbmv<half>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xtpmv<T> routine on it and forwards the
+// arguments to DoTpmv, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoTpmv returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Tpmv(const Layout layout, const Triangle triangle,
+                const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const CUdeviceptr ap_buffer, const size_t ap_offset,
+                CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xtpmv<T>(queue_cpp, nullptr);
+    routine.DoTpmv(layout, triangle, a_transpose, diagonal,
+                   n,
+                   Buffer<T>(ap_buffer), ap_offset,
+                   Buffer<T>(x_buffer), x_offset, x_inc);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tpmv<double>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tpmv<float2>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tpmv<double2>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tpmv<half>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xtrsv<T> routine on it and forwards the
+// arguments to DoTrsv, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoTrsv returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Trsv(const Layout layout, const Triangle triangle,
+                const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xtrsv<T>(queue_cpp, nullptr);
+    routine.DoTrsv(layout, triangle, a_transpose, diagonal,
+                   n,
+                   Buffer<T>(a_buffer), a_offset, a_ld,
+                   Buffer<T>(x_buffer), x_offset, x_inc);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Trsv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trsv<double>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trsv<float2>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trsv<double2>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
+//
+// Placeholder entry point: no argument is inspected and every call reports
+// StatusCode::kNotImplemented.
+template <typename T>
+StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal,
+                const size_t, const size_t,
+                const CUdeviceptr, const size_t, const size_t,
+                CUdeviceptr, const size_t, const size_t,
+                CUstream* stream) {
+  const auto status = StatusCode::kNotImplemented;
+  return status;
+}
+template StatusCode PUBLIC_API Tbsv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tbsv<double>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tbsv<float2>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tbsv<double2>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
+//
+// Placeholder entry point: no argument is inspected and every call reports
+// StatusCode::kNotImplemented.
+template <typename T>
+StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal,
+                const size_t,
+                const CUdeviceptr, const size_t,
+                CUdeviceptr, const size_t, const size_t,
+                CUstream* stream) {
+  const auto status = StatusCode::kNotImplemented;
+  return status;
+}
+template StatusCode PUBLIC_API Tpsv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tpsv<double>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tpsv<float2>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Tpsv<double2>(const Layout, const Triangle, const Transpose, const Diagonal,
+ const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// General rank-1 matrix update: SGER/DGER/HGER
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xger<T> routine on it and forwards the
+// arguments to DoGer, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoGer returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Ger(const Layout layout,
+               const size_t m, const size_t n,
+               const T alpha,
+               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+               const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+               CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+               CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xger<T>(queue_cpp, nullptr);
+    routine.DoGer(layout,
+                  m, n,
+                  alpha,
+                  Buffer<T>(x_buffer), x_offset, x_inc,
+                  Buffer<T>(y_buffer), y_offset, y_inc,
+                  Buffer<T>(a_buffer), a_offset, a_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Ger<float>(const Layout,
+ const size_t, const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Ger<double>(const Layout,
+ const size_t, const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Ger<half>(const Layout,
+ const size_t, const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// General rank-1 complex matrix update: CGERU/ZGERU
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xgeru<T> routine on it and forwards the
+// arguments to DoGeru, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoGeru returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Geru(const Layout layout,
+                const size_t m, const size_t n,
+                const T alpha,
+                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+                CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xgeru<T>(queue_cpp, nullptr);
+    routine.DoGeru(layout,
+                   m, n,
+                   alpha,
+                   Buffer<T>(x_buffer), x_offset, x_inc,
+                   Buffer<T>(y_buffer), y_offset, y_inc,
+                   Buffer<T>(a_buffer), a_offset, a_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Geru<float2>(const Layout,
+ const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Geru<double2>(const Layout,
+ const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// General rank-1 complex conjugated matrix update: CGERC/ZGERC
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xgerc<T> routine on it and forwards the
+// arguments to DoGerc, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoGerc returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Gerc(const Layout layout,
+                const size_t m, const size_t n,
+                const T alpha,
+                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+                CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xgerc<T>(queue_cpp, nullptr);
+    routine.DoGerc(layout,
+                   m, n,
+                   alpha,
+                   Buffer<T>(x_buffer), x_offset, x_inc,
+                   Buffer<T>(y_buffer), y_offset, y_inc,
+                   Buffer<T>(a_buffer), a_offset, a_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Gerc<float2>(const Layout,
+ const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Gerc<double2>(const Layout,
+ const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Hermitian rank-1 matrix update: CHER/ZHER
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xher<std::complex<T>,T> routine on it and
+// forwards the arguments to DoHer. T is the type of alpha; both device pointers are viewed as
+// Buffer<std::complex<T>>. Returns StatusCode::kSuccess when DoHer returns normally; otherwise
+// returns the result of DispatchException().
+template <typename T>
+StatusCode Her(const Layout layout, const Triangle triangle,
+               const size_t n,
+               const T alpha,
+               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+               CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+               CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xher<std::complex<T>,T>(queue_cpp, nullptr);
+    routine.DoHer(layout, triangle,
+                  n,
+                  alpha,
+                  Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
+                  Buffer<std::complex<T>>(a_buffer), a_offset, a_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Her<float>(const Layout, const Triangle,
+ const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Her<double>(const Layout, const Triangle,
+ const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Hermitian packed rank-1 matrix update: CHPR/ZHPR
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xhpr<std::complex<T>,T> routine on it and
+// forwards the arguments to DoHpr. T is the type of alpha; both device pointers are viewed as
+// Buffer<std::complex<T>>. Returns StatusCode::kSuccess when DoHpr returns normally; otherwise
+// returns the result of DispatchException().
+template <typename T>
+StatusCode Hpr(const Layout layout, const Triangle triangle,
+               const size_t n,
+               const T alpha,
+               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+               CUdeviceptr ap_buffer, const size_t ap_offset,
+               CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xhpr<std::complex<T>,T>(queue_cpp, nullptr);
+    routine.DoHpr(layout, triangle,
+                  n,
+                  alpha,
+                  Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
+                  Buffer<std::complex<T>>(ap_buffer), ap_offset);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Hpr<float>(const Layout, const Triangle,
+ const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Hpr<double>(const Layout, const Triangle,
+ const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+
+// Hermitian rank-2 matrix update: CHER2/ZHER2
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xher2<T> routine on it and forwards the
+// arguments to DoHer2, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoHer2 returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Her2(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+                CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xher2<T>(queue_cpp, nullptr);
+    routine.DoHer2(layout, triangle,
+                   n,
+                   alpha,
+                   Buffer<T>(x_buffer), x_offset, x_inc,
+                   Buffer<T>(y_buffer), y_offset, y_inc,
+                   Buffer<T>(a_buffer), a_offset, a_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Her2<float2>(const Layout, const Triangle,
+ const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Her2<double2>(const Layout, const Triangle,
+ const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xhpr2<T> routine on it and forwards the
+// arguments to DoHpr2, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoHpr2 returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Hpr2(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+                CUdeviceptr ap_buffer, const size_t ap_offset,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xhpr2<T>(queue_cpp, nullptr);
+    routine.DoHpr2(layout, triangle,
+                   n,
+                   alpha,
+                   Buffer<T>(x_buffer), x_offset, x_inc,
+                   Buffer<T>(y_buffer), y_offset, y_inc,
+                   Buffer<T>(ap_buffer), ap_offset);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Hpr2<float2>(const Layout, const Triangle,
+ const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Hpr2<double2>(const Layout, const Triangle,
+ const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+
+// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xsyr<T> routine on it and forwards the
+// arguments to DoSyr, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoSyr returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Syr(const Layout layout, const Triangle triangle,
+               const size_t n,
+               const T alpha,
+               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+               CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+               CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xsyr<T>(queue_cpp, nullptr);
+    routine.DoSyr(layout, triangle,
+                  n,
+                  alpha,
+                  Buffer<T>(x_buffer), x_offset, x_inc,
+                  Buffer<T>(a_buffer), a_offset, a_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Syr<float>(const Layout, const Triangle,
+ const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Syr<double>(const Layout, const Triangle,
+ const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Syr<half>(const Layout, const Triangle,
+ const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xspr<T> routine on it and forwards the
+// arguments to DoSpr, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoSpr returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Spr(const Layout layout, const Triangle triangle,
+               const size_t n,
+               const T alpha,
+               const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+               CUdeviceptr ap_buffer, const size_t ap_offset,
+               CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xspr<T>(queue_cpp, nullptr);
+    routine.DoSpr(layout, triangle,
+                  n,
+                  alpha,
+                  Buffer<T>(x_buffer), x_offset, x_inc,
+                  Buffer<T>(ap_buffer), ap_offset);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Spr<float>(const Layout, const Triangle,
+ const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Spr<double>(const Layout, const Triangle,
+ const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Spr<half>(const Layout, const Triangle,
+ const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+
+// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xsyr2<T> routine on it and forwards the
+// arguments to DoSyr2, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoSyr2 returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Syr2(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+                CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xsyr2<T>(queue_cpp, nullptr);
+    routine.DoSyr2(layout, triangle,
+                   n,
+                   alpha,
+                   Buffer<T>(x_buffer), x_offset, x_inc,
+                   Buffer<T>(y_buffer), y_offset, y_inc,
+                   Buffer<T>(a_buffer), a_offset, a_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Syr2<float>(const Layout, const Triangle,
+ const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Syr2<double>(const Layout, const Triangle,
+ const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Syr2<half>(const Layout, const Triangle,
+ const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xspr2<T> routine on it and forwards the
+// arguments to DoSpr2, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoSpr2 returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Spr2(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc,
+                const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc,
+                CUdeviceptr ap_buffer, const size_t ap_offset,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xspr2<T>(queue_cpp, nullptr);
+    routine.DoSpr2(layout, triangle,
+                   n,
+                   alpha,
+                   Buffer<T>(x_buffer), x_offset, x_inc,
+                   Buffer<T>(y_buffer), y_offset, y_inc,
+                   Buffer<T>(ap_buffer), ap_offset);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Spr2<float>(const Layout, const Triangle,
+ const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Spr2<double>(const Layout, const Triangle,
+ const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Spr2<half>(const Layout, const Triangle,
+ const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+// =================================================================================================
+
+// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xgemm<T> routine on it and forwards the
+// arguments to DoGemm, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoGemm returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+                const size_t m, const size_t n, const size_t k,
+                const T alpha,
+                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
+                const T beta,
+                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xgemm<T>(queue_cpp, nullptr);
+    routine.DoGemm(layout, a_transpose, b_transpose,
+                   m, n, k,
+                   alpha,
+                   Buffer<T>(a_buffer), a_offset, a_ld,
+                   Buffer<T>(b_buffer), b_offset, b_ld,
+                   beta,
+                   Buffer<T>(c_buffer), c_offset, c_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Gemm<float>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Gemm<double>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Gemm<float2>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Gemm<double2>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Gemm<half>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const half,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
+//
+// Wraps the caller's CUDA stream in a Queue, builds an Xsymm<T> routine on it and forwards the
+// arguments to DoSymm, viewing each CUdeviceptr as a Buffer<T>. Returns StatusCode::kSuccess when
+// DoSymm returns normally; otherwise returns the result of DispatchException().
+template <typename T>
+StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
+                const size_t m, const size_t n,
+                const T alpha,
+                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
+                const T beta,
+                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
+                CUstream* stream) {
+  try {
+    // 'stream' is the only queue handle this entry point receives and there is no event argument.
+    // NOTE(review): assumes Queue is constructible from a CUstream and that the routine accepts a
+    // null event pointer - confirm against cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xsymm<T>(queue_cpp, nullptr);
+    routine.DoSymm(layout, side, triangle,
+                   m, n,
+                   alpha,
+                   Buffer<T>(a_buffer), a_offset, a_ld,
+                   Buffer<T>(b_buffer), b_offset, b_ld,
+                   beta,
+                   Buffer<T>(c_buffer), c_offset, c_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+// Explicit instantiations of Symm for the float, double, float2, double2 and half data types
+template StatusCode PUBLIC_API Symm<float>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Symm<double>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Symm<float2>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Symm<double2>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Symm<half>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const half,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
+//
+// API entry point: wraps the CUDA stream in a Queue, creates an Xhemm<T> routine on that queue and
+// forwards the arguments to DoHemm, with each raw device pointer wrapped in a Buffer<T>. Returns
+// StatusCode::kSuccess if no exception escapes; otherwise returns the result of
+// DispatchException(). 'stream' is dereferenced, so it must point to a valid stream.
+template <typename T>
+StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
+                const size_t m, const size_t n,
+                const T alpha,
+                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
+                const T beta,
+                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
+                CUstream* stream) {
+  try {
+    // This entry point receives a CUDA stream and has no event argument: the stream is wrapped in
+    // the Queue and a null event pointer is handed to the routine.
+    // NOTE(review): assumes Queue is constructible from a raw CUstream - confirm in cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xhemm<T>(queue_cpp, nullptr);
+    routine.DoHemm(layout, side, triangle,
+                   m, n,
+                   alpha,
+                   Buffer<T>(a_buffer), a_offset, a_ld,
+                   Buffer<T>(b_buffer), b_offset, b_ld,
+                   beta,
+                   Buffer<T>(c_buffer), c_offset, c_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+// Explicit instantiations of Hemm for the float2 and double2 data types
+template StatusCode PUBLIC_API Hemm<float2>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Hemm<double2>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
+//
+// API entry point: wraps the CUDA stream in a Queue, creates an Xsyrk<T> routine on that queue and
+// forwards the arguments to DoSyrk, with each raw device pointer wrapped in a Buffer<T>. Returns
+// StatusCode::kSuccess if no exception escapes; otherwise returns the result of
+// DispatchException(). 'stream' is dereferenced, so it must point to a valid stream.
+template <typename T>
+StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                const size_t n, const size_t k,
+                const T alpha,
+                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                const T beta,
+                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
+                CUstream* stream) {
+  try {
+    // This entry point receives a CUDA stream and has no event argument: the stream is wrapped in
+    // the Queue and a null event pointer is handed to the routine.
+    // NOTE(review): assumes Queue is constructible from a raw CUstream - confirm in cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xsyrk<T>(queue_cpp, nullptr);
+    routine.DoSyrk(layout, triangle, a_transpose,
+                   n, k,
+                   alpha,
+                   Buffer<T>(a_buffer), a_offset, a_ld,
+                   beta,
+                   Buffer<T>(c_buffer), c_offset, c_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+// Explicit instantiations of Syrk for the float, double, float2, double2 and half data types
+template StatusCode PUBLIC_API Syrk<float>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ const float,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Syrk<double>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ const double,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Syrk<float2>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const float2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Syrk<double2>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const double2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Syrk<half>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ const half,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Rank-K update of a hermitian matrix: CHERK/ZHERK
+//
+// API entry point: wraps the CUDA stream in a Queue, creates an Xherk<std::complex<T>,T> routine on
+// that queue and forwards the arguments to DoHerk. Here T is the type of the alpha and beta
+// scalars, whereas the raw device pointers are wrapped in Buffer<std::complex<T>>. Returns
+// StatusCode::kSuccess if no exception escapes; otherwise returns the result of
+// DispatchException(). 'stream' is dereferenced, so it must point to a valid stream.
+template <typename T>
+StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                const size_t n, const size_t k,
+                const T alpha,
+                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                const T beta,
+                CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
+                CUstream* stream) {
+  try {
+    // This entry point receives a CUDA stream and has no event argument: the stream is wrapped in
+    // the Queue and a null event pointer is handed to the routine.
+    // NOTE(review): assumes Queue is constructible from a raw CUstream - confirm in cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xherk<std::complex<T>,T>(queue_cpp, nullptr);
+    routine.DoHerk(layout, triangle, a_transpose,
+                   n, k,
+                   alpha,
+                   Buffer<std::complex<T>>(a_buffer), a_offset, a_ld,
+                   beta,
+                   Buffer<std::complex<T>>(c_buffer), c_offset, c_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+// Explicit instantiations of Herk for the float and double scalar types
+template StatusCode PUBLIC_API Herk<float>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ const float,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Herk<double>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ const double,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
+//
+// API entry point: wraps the CUDA stream in a Queue, creates an Xsyr2k<T> routine on that queue
+// and forwards the arguments to DoSyr2k, with each raw device pointer wrapped in a Buffer<T>.
+// Returns StatusCode::kSuccess if no exception escapes; otherwise returns the result of
+// DispatchException(). 'stream' is dereferenced, so it must point to a valid stream.
+template <typename T>
+StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                 const size_t n, const size_t k,
+                 const T alpha,
+                 const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                 const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
+                 const T beta,
+                 CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
+                 CUstream* stream) {
+  try {
+    // This entry point receives a CUDA stream and has no event argument: the stream is wrapped in
+    // the Queue and a null event pointer is handed to the routine.
+    // NOTE(review): assumes Queue is constructible from a raw CUstream - confirm in cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xsyr2k<T>(queue_cpp, nullptr);
+    routine.DoSyr2k(layout, triangle, ab_transpose,
+                    n, k,
+                    alpha,
+                    Buffer<T>(a_buffer), a_offset, a_ld,
+                    Buffer<T>(b_buffer), b_offset, b_ld,
+                    beta,
+                    Buffer<T>(c_buffer), c_offset, c_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+// Explicit instantiations of Syr2k for the float, double, float2, double2 and half data types
+template StatusCode PUBLIC_API Syr2k<float>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Syr2k<double>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Syr2k<float2>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Syr2k<double2>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double2,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Syr2k<half>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const half,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
+//
+// API entry point: wraps the CUDA stream in a Queue, creates an Xher2k<T,U> routine on that queue
+// and forwards the arguments to DoHer2k, with each raw device pointer wrapped in a Buffer<T>.
+// T is the type of alpha and of the buffers, U is the type of beta. Returns
+// StatusCode::kSuccess if no exception escapes; otherwise returns the result of
+// DispatchException(). 'stream' is dereferenced, so it must point to a valid stream.
+template <typename T, typename U>
+StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                 const size_t n, const size_t k,
+                 const T alpha,
+                 const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                 const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
+                 const U beta,
+                 CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld,
+                 CUstream* stream) {
+  try {
+    // This entry point receives a CUDA stream and has no event argument: the stream is wrapped in
+    // the Queue and a null event pointer is handed to the routine.
+    // NOTE(review): assumes Queue is constructible from a raw CUstream - confirm in cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xher2k<T,U>(queue_cpp, nullptr);
+    routine.DoHer2k(layout, triangle, ab_transpose,
+                    n, k,
+                    alpha,
+                    Buffer<T>(a_buffer), a_offset, a_ld,
+                    Buffer<T>(b_buffer), b_offset, b_ld,
+                    beta,
+                    Buffer<T>(c_buffer), c_offset, c_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+// Explicit instantiations of Her2k for the <float2,float> and <double2,double> type pairs
+template StatusCode PUBLIC_API Her2k<float2,float>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const float,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Her2k<double2,double>(const Layout, const Triangle, const Transpose,
+ const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ const CUdeviceptr, const size_t, const size_t,
+ const double,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
+//
+// API entry point: wraps the CUDA stream in a Queue, creates an Xtrmm<T> routine on that queue and
+// forwards the arguments to DoTrmm, with each raw device pointer wrapped in a Buffer<T>. Returns
+// StatusCode::kSuccess if no exception escapes; otherwise returns the result of
+// DispatchException(). 'stream' is dereferenced, so it must point to a valid stream.
+template <typename T>
+StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
+                const Transpose a_transpose, const Diagonal diagonal,
+                const size_t m, const size_t n,
+                const T alpha,
+                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
+                CUstream* stream) {
+  try {
+    // This entry point receives a CUDA stream and has no event argument: the stream is wrapped in
+    // the Queue and a null event pointer is handed to the routine.
+    // NOTE(review): assumes Queue is constructible from a raw CUstream - confirm in cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xtrmm<T>(queue_cpp, nullptr);
+    routine.DoTrmm(layout, side, triangle, a_transpose, diagonal,
+                   m, n,
+                   alpha,
+                   Buffer<T>(a_buffer), a_offset, a_ld,
+                   Buffer<T>(b_buffer), b_offset, b_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+// Explicit instantiations of Trmm for the float, double, float2, double2 and half data types
+template StatusCode PUBLIC_API Trmm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trmm<double>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trmm<float2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trmm<double2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
+//
+// API entry point: wraps the CUDA stream in a Queue, creates an Xtrsm<T> routine on that queue and
+// forwards the arguments to DoTrsm, with each raw device pointer wrapped in a Buffer<T>. Returns
+// StatusCode::kSuccess if no exception escapes; otherwise returns the result of
+// DispatchException(). 'stream' is dereferenced, so it must point to a valid stream.
+template <typename T>
+StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
+                const Transpose a_transpose, const Diagonal diagonal,
+                const size_t m, const size_t n,
+                const T alpha,
+                const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
+                CUstream* stream) {
+  try {
+    // This entry point receives a CUDA stream and has no event argument: the stream is wrapped in
+    // the Queue and a null event pointer is handed to the routine.
+    // NOTE(review): assumes Queue is constructible from a raw CUstream - confirm in cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xtrsm<T>(queue_cpp, nullptr);
+    routine.DoTrsm(layout, side, triangle, a_transpose, diagonal,
+                   m, n,
+                   alpha,
+                   Buffer<T>(a_buffer), a_offset, a_ld,
+                   Buffer<T>(b_buffer), b_offset, b_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+// Explicit instantiations of Trsm for the float, double, float2 and double2 data types
+template StatusCode PUBLIC_API Trsm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trsm<double>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trsm<float2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
+ const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
+//
+// API entry point: wraps the CUDA stream in a Queue, creates an Xomatcopy<T> routine on that queue
+// and forwards the arguments to DoOmatcopy, with each raw device pointer wrapped in a Buffer<T>.
+// Returns StatusCode::kSuccess if no exception escapes; otherwise returns the result of
+// DispatchException(). 'stream' is dereferenced, so it must point to a valid stream.
+template <typename T>
+StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld,
+                    CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld,
+                    CUstream* stream) {
+  try {
+    // This entry point receives a CUDA stream and has no event argument: the stream is wrapped in
+    // the Queue and a null event pointer is handed to the routine.
+    // NOTE(review): assumes Queue is constructible from a raw CUstream - confirm in cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xomatcopy<T>(queue_cpp, nullptr);
+    routine.DoOmatcopy(layout, a_transpose,
+                       m, n,
+                       alpha,
+                       Buffer<T>(a_buffer), a_offset, a_ld,
+                       Buffer<T>(b_buffer), b_offset, b_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+// Explicit instantiations of Omatcopy for the float, double, float2, double2 and half data types
+template StatusCode PUBLIC_API Omatcopy<float>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const float,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Omatcopy<double>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const double,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Omatcopy<float2>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const float2,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Omatcopy<double2>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const double2,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const half,
+ const CUdeviceptr, const size_t, const size_t,
+ CUdeviceptr, const size_t, const size_t,
+ CUstream*);
+
+// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL
+//
+// API entry point: wraps the CUDA stream in a Queue, creates an Xim2col<T> routine on that queue
+// and forwards the arguments to DoIm2col, with each raw device pointer wrapped in a Buffer<T>.
+// Returns StatusCode::kSuccess if no exception escapes; otherwise returns the result of
+// DispatchException(). 'stream' is dereferenced, so it must point to a valid stream.
+template <typename T>
+StatusCode Im2col(const size_t channels, const size_t height, const size_t width,
+                  const size_t kernel_h, const size_t kernel_w,
+                  const size_t pad_h, const size_t pad_w,
+                  const size_t stride_h, const size_t stride_w,
+                  const size_t dilation_h, const size_t dilation_w,
+                  const CUdeviceptr im_buffer, const size_t im_offset,
+                  CUdeviceptr col_buffer, const size_t col_offset,
+                  CUstream* stream) {
+  try {
+    // This entry point receives a CUDA stream and has no event argument: the stream is wrapped in
+    // the Queue and a null event pointer is handed to the routine.
+    // NOTE(review): assumes Queue is constructible from a raw CUstream - confirm in cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = Xim2col<T>(queue_cpp, nullptr);
+    routine.DoIm2col(channels, height, width,
+                     kernel_h, kernel_w,
+                     pad_h, pad_w,
+                     stride_h, stride_w,
+                     dilation_h, dilation_w,
+                     Buffer<T>(im_buffer), im_offset,
+                     Buffer<T>(col_buffer), col_offset);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+// Explicit instantiations of Im2col for the float, double, float2, double2 and half data types
+template StatusCode PUBLIC_API Im2col<float>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Im2col<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Im2col<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Im2col<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API Im2col<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ CUstream*);
+
+// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
+//
+// API entry point: wraps the CUDA stream in a Queue, creates an XaxpyBatched<T> routine on that
+// queue and forwards the arguments to DoAxpyBatched. The first 'batch_count' entries of the
+// 'alphas', 'x_offsets' and 'y_offsets' arrays are copied into vectors before the call, so each
+// array must hold at least 'batch_count' elements. Returns StatusCode::kSuccess if no exception
+// escapes; otherwise returns the result of DispatchException(). 'stream' is dereferenced, so it
+// must point to a valid stream.
+template <typename T>
+StatusCode AxpyBatched(const size_t n,
+                       const T *alphas,
+                       const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc,
+                       CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc,
+                       const size_t batch_count,
+                       CUstream* stream) {
+  try {
+    // This entry point receives a CUDA stream and has no event argument: the stream is wrapped in
+    // the Queue and a null event pointer is handed to the routine.
+    // NOTE(review): assumes Queue is constructible from a raw CUstream - confirm in cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = XaxpyBatched<T>(queue_cpp, nullptr);
+    // Range construction copies the per-batch arrays with a single allocation per vector
+    auto alphas_cpp = std::vector<T>(alphas, alphas + batch_count);
+    auto x_offsets_cpp = std::vector<size_t>(x_offsets, x_offsets + batch_count);
+    auto y_offsets_cpp = std::vector<size_t>(y_offsets, y_offsets + batch_count);
+    routine.DoAxpyBatched(n,
+                          alphas_cpp,
+                          Buffer<T>(x_buffer), x_offsets_cpp, x_inc,
+                          Buffer<T>(y_buffer), y_offsets_cpp, y_inc,
+                          batch_count);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+// Explicit instantiations of AxpyBatched for the float, double, float2, double2 and half data types
+template StatusCode PUBLIC_API AxpyBatched<float>(const size_t,
+ const float*,
+ const CUdeviceptr, const size_t*, const size_t,
+ CUdeviceptr, const size_t*, const size_t,
+ const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API AxpyBatched<double>(const size_t,
+ const double*,
+ const CUdeviceptr, const size_t*, const size_t,
+ CUdeviceptr, const size_t*, const size_t,
+ const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t,
+ const float2*,
+ const CUdeviceptr, const size_t*, const size_t,
+ CUdeviceptr, const size_t*, const size_t,
+ const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t,
+ const double2*,
+ const CUdeviceptr, const size_t*, const size_t,
+ CUdeviceptr, const size_t*, const size_t,
+ const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API AxpyBatched<half>(const size_t,
+ const half*,
+ const CUdeviceptr, const size_t*, const size_t,
+ CUdeviceptr, const size_t*, const size_t,
+ const size_t,
+ CUstream*);
+
+// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
+//
+// API entry point: wraps the CUDA stream in a Queue, creates an XgemmBatched<T> routine on that
+// queue and forwards the arguments to DoGemmBatched. The first 'batch_count' entries of the
+// 'alphas', 'betas', 'a_offsets', 'b_offsets' and 'c_offsets' arrays are copied into vectors before
+// the call, so each array must hold at least 'batch_count' elements. Returns
+// StatusCode::kSuccess if no exception escapes; otherwise returns the result of
+// DispatchException(). 'stream' is dereferenced, so it must point to a valid stream.
+template <typename T>
+StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+                       const size_t m, const size_t n, const size_t k,
+                       const T *alphas,
+                       const CUdeviceptr a_buffer, const size_t *a_offsets, const size_t a_ld,
+                       const CUdeviceptr b_buffer, const size_t *b_offsets, const size_t b_ld,
+                       const T *betas,
+                       CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld,
+                       const size_t batch_count,
+                       CUstream* stream) {
+  try {
+    // This entry point receives a CUDA stream and has no event argument: the stream is wrapped in
+    // the Queue and a null event pointer is handed to the routine.
+    // NOTE(review): assumes Queue is constructible from a raw CUstream - confirm in cupp11.hpp.
+    auto queue_cpp = Queue(*stream);
+    auto routine = XgemmBatched<T>(queue_cpp, nullptr);
+    // Range construction copies the per-batch arrays with a single allocation per vector
+    auto alphas_cpp = std::vector<T>(alphas, alphas + batch_count);
+    auto betas_cpp = std::vector<T>(betas, betas + batch_count);
+    auto a_offsets_cpp = std::vector<size_t>(a_offsets, a_offsets + batch_count);
+    auto b_offsets_cpp = std::vector<size_t>(b_offsets, b_offsets + batch_count);
+    auto c_offsets_cpp = std::vector<size_t>(c_offsets, c_offsets + batch_count);
+    routine.DoGemmBatched(layout, a_transpose, b_transpose,
+                          m, n, k,
+                          alphas_cpp,
+                          Buffer<T>(a_buffer), a_offsets_cpp, a_ld,
+                          Buffer<T>(b_buffer), b_offsets_cpp, b_ld,
+                          betas_cpp,
+                          Buffer<T>(c_buffer), c_offsets_cpp, c_ld,
+                          batch_count);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+// Explicit instantiations of GemmBatched for the float, double, float2, double2 and half data types
+template StatusCode PUBLIC_API GemmBatched<float>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const float*,
+ const CUdeviceptr, const size_t*, const size_t,
+ const CUdeviceptr, const size_t*, const size_t,
+ const float*,
+ CUdeviceptr, const size_t*, const size_t,
+ const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API GemmBatched<double>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const double*,
+ const CUdeviceptr, const size_t*, const size_t,
+ const CUdeviceptr, const size_t*, const size_t,
+ const double*,
+ CUdeviceptr, const size_t*, const size_t,
+ const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API GemmBatched<float2>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const float2*,
+ const CUdeviceptr, const size_t*, const size_t,
+ const CUdeviceptr, const size_t*, const size_t,
+ const float2*,
+ CUdeviceptr, const size_t*, const size_t,
+ const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API GemmBatched<double2>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const double2*,
+ const CUdeviceptr, const size_t*, const size_t,
+ const CUdeviceptr, const size_t*, const size_t,
+ const double2*,
+ CUdeviceptr, const size_t*, const size_t,
+ const size_t,
+ CUstream*);
+template StatusCode PUBLIC_API GemmBatched<half>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const half*,
+ const CUdeviceptr, const size_t*, const size_t,
+ const CUdeviceptr, const size_t*, const size_t,
+ const half*,
+ CUdeviceptr, const size_t*, const size_t,
+ const size_t,
+ CUstream*);
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/cupp11.hpp b/src/cupp11.hpp
new file mode 100644
index 00000000..988366ea
--- /dev/null
+++ b/src/cupp11.hpp
@@ -0,0 +1,770 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a bunch of C++11 classes that act as wrappers around CUDA objects and API
+// calls. The main benefits are increased abstraction, automatic memory management, and portability.
+// Portability here means that a similar header exists for OpenCL with the same classes and
+// interfaces. In other words, moving from the CUDA API to the OpenCL API becomes a one-line change.
+//
+// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
+// therefore contains the following header copyright notice:
+//
+// =================================================================================================
+//
+// Copyright 2015 SURFsara
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CUPP11_H_
+#define CLBLAST_CUPP11_H_
+
+// C++
+#include <algorithm> // std::copy
+#include <cstdio> // std::fprintf
+#include <memory> // std::shared_ptr
+#include <string> // std::string
+#include <vector> // std::vector
+
+// CUDA
+#include <cuda.h> // CUDA driver API
+#include <nvrtc.h> // NVIDIA runtime compilation API
+
+// Exception classes
+#include "cxpp11_common.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Max-length of strings: the size of the character buffer that is allocated when querying string
+// properties through the CUDA API (e.g. the device name)
+constexpr auto kStringLength = 256;
+
+// =================================================================================================
+
+// Represents a runtime error returned by a CUDA driver API function
+class CLCudaAPIError : public ErrorCode<DeviceError, CUresult> {
+public:
+ explicit CLCudaAPIError(CUresult status, const std::string &where):
+ ErrorCode(status, where, "CUDA error: " + where + ": " +
+ GetErrorName(status) + " --> " + GetErrorString(status)) {
+ }
+
+ static void Check(const CUresult status, const std::string &where) {
+ if (status != CUDA_SUCCESS) {
+ throw CLCudaAPIError(status, where);
+ }
+ }
+
+ static void CheckDtor(const CUresult status, const std::string &where) {
+ if (status != CUDA_SUCCESS) {
+ fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPIError(status, where).what());
+ }
+ }
+
+private:
+ std::string GetErrorName(CUresult status) const {
+ const char* status_code;
+ cuGetErrorName(status, &status_code);
+ return std::string(status_code);
+ }
+ std::string GetErrorString(CUresult status) const {
+ const char* status_string;
+ cuGetErrorString(status, &status_string);
+ return std::string(status_string);
+ }
+};
+
+// Represents a runtime error returned by a CUDA runtime compilation API function
+class CLCudaAPINVRTCError : public ErrorCode<DeviceError, nvrtcResult> {
+public:
+ explicit CLCudaAPINVRTCError(nvrtcResult status, const std::string &where):
+ ErrorCode(status, where, "CUDA NVRTC error: " + where + ": " + GetErrorString(status)) {
+ }
+
+ static void Check(const nvrtcResult status, const std::string &where) {
+ if (status != NVRTC_SUCCESS) {
+ throw CLCudaAPINVRTCError(status, where);
+ }
+ }
+
+ static void CheckDtor(const nvrtcResult status, const std::string &where) {
+ if (status != NVRTC_SUCCESS) {
+ fprintf(stderr, "CLCudaAPI: %s (ignoring)\n", CLCudaAPINVRTCError(status, where).what());
+ }
+ }
+
+private:
+ std::string GetErrorString(nvrtcResult status) const {
+ const char* status_string = nvrtcGetErrorString(status);
+ return std::string(status_string);
+ }
+};
+
+// Exception returned when building a program
+using CLCudaAPIBuildError = CLCudaAPINVRTCError;
+
+// =================================================================================================
+
+// Error occurred in CUDA driver or runtime compilation API: these macros evaluate 'call' and pass
+// its status to the Check function of the matching error class (which throws unless the status
+// signals success), together with the stringified call text as processed by TrimCallString
+#define CheckError(call) CLCudaAPIError::Check(call, CLCudaAPIError::TrimCallString(#call))
+#define CheckErrorNVRTC(call) CLCudaAPINVRTCError::Check(call, CLCudaAPINVRTCError::TrimCallString(#call))
+
+// Error occurred in CUDA driver or runtime compilation API (no-exception version for destructors):
+// same as above but through CheckDtor, which prints the error to stderr instead of throwing it
+#define CheckErrorDtor(call) CLCudaAPIError::CheckDtor(call, CLCudaAPIError::TrimCallString(#call))
+#define CheckErrorDtorNVRTC(call) CLCudaAPINVRTCError::CheckDtor(call, CLCudaAPINVRTCError::TrimCallString(#call))
+
+// =================================================================================================
+
+// C++11 version of two 'CUevent' pointers: a 'start' and an 'end' event, both held through shared
+// pointers so that copies of an Event refer to the same two CUDA events
+class Event {
+public:
+  // Note that there is no constructor based on the regular CUDA data-type because of extra state
+
+  // Regular constructor with memory management. The handles are value-initialized (null) before
+  // cuEventCreate is called, so that the deleter never passes an indeterminate handle to
+  // cuEventDestroy when one of the creation calls fails and throws.
+  explicit Event():
+      start_(new CUevent(), DestroyEvent),
+      end_(new CUevent(), DestroyEvent) {
+    CheckError(cuEventCreate(start_.get(), CU_EVENT_DEFAULT));
+    CheckError(cuEventCreate(end_.get(), CU_EVENT_DEFAULT));
+  }
+
+  // Waits for completion of this event (not implemented for CUDA)
+  void WaitForCompletion() const { }
+
+  // Retrieves the elapsed time of the last recorded event. The status returned by
+  // cuEventElapsedTime is not checked; 'result' starts out as 0.0f.
+  float GetElapsedTime() const {
+    auto result = 0.0f;
+    cuEventElapsedTime(&result, *start_, *end_);
+    return result;
+  }
+
+  // Accessors to the private data-members
+  const CUevent& start() const { return *start_; }
+  const CUevent& end() const { return *end_; }
+  Event* pointer() { return this; }
+private:
+  // Deleter for the shared pointers: destroys the CUDA event (if one was created) and releases
+  // the storage of the handle. Errors are reported but not thrown.
+  static void DestroyEvent(CUevent* e) {
+    if (*e != nullptr) { CheckErrorDtor(cuEventDestroy(*e)); }
+    delete e;
+  }
+
+  std::shared_ptr<CUevent> start_;
+  std::shared_ptr<CUevent> end_;
+};
+
+// Pointer to a CUDA event
+using EventPointer = Event*;
+
+// =================================================================================================
+
+// Raw platform ID type
+using RawPlatformID = size_t;
+
+// The CUDA platform: initializes the CUDA driver API
+class Platform {
+public:
+
+  // Sets up the platform by calling cuInit. The only accepted platform ID is 0: any other value
+  // results in a LogicError. The stored ID is always 0.
+  explicit Platform(const size_t platform_id) : platform_id_(0) {
+    if (platform_id != 0) {
+      throw LogicError("CUDA back-end requires a platform ID of 0");
+    }
+    CheckError(cuInit(0));
+  }
+
+  // Platform information: fixed strings for the name and vendor, a driver query for the version
+  std::string Name() const { return std::string("CUDA"); }
+  std::string Vendor() const { return std::string("NVIDIA Corporation"); }
+  std::string Version() const {
+    int result = 0;
+    CheckError(cuDriverGetVersion(&result));
+    auto version = std::string("CUDA driver ");
+    version += std::to_string(result);
+    return version;
+  }
+
+  // Number of devices on this platform, as reported by cuDeviceGetCount
+  size_t NumDevices() const {
+    int result = 0;
+    CheckError(cuDeviceGetCount(&result));
+    return static_cast<size_t>(result);
+  }
+
+  // Accessor to the raw ID (which doesn't exist in the CUDA back-end, this is always just 0)
+  const RawPlatformID& operator()() const { return platform_id_; }
+private:
+  const size_t platform_id_;
+};
+
+// Retrieves a vector with all platforms: for CUDA this is a single Platform with ID 0
+inline std::vector<Platform> GetAllPlatforms() {
+  return std::vector<Platform>{ Platform(size_t{0}) };
+}
+
+// =================================================================================================
+
+// Raw device ID type
+using RawDeviceID = CUdevice;
+
+// C++11 version of 'CUdevice'. Provides device queries through the CUDA driver API; queries that
+// have no CUDA equivalent return fixed values.
+class Device {
+public:
+
+  // Constructor based on the regular CUDA data-type
+  explicit Device(const CUdevice device): device_(device) { }
+
+  // Initialization: selects the device with ordinal 'device_id'. Throws a RuntimeError when the
+  // platform reports no devices or when the ordinal is out of range.
+  explicit Device(const Platform &platform, const size_t device_id) {
+    const auto num_devices = platform.NumDevices();
+    if (num_devices == 0) {
+      throw RuntimeError("Device: no devices found");
+    }
+    if (device_id >= num_devices) {
+      throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
+    }
+
+    // The driver takes the ordinal as an 'int'; the range check above makes the cast safe because
+    // the device count itself originates from an 'int'.
+    CheckError(cuDeviceGet(&device_, static_cast<int>(device_id)));
+  }
+
+  // Methods to retrieve device information
+  RawPlatformID PlatformID() const { return 0; }
+  std::string Version() const {
+    auto result = 0;
+    CheckError(cuDriverGetVersion(&result));
+    return "CUDA driver "+std::to_string(result);
+  }
+  size_t VersionNumber() const {
+    auto result = 0;
+    CheckError(cuDriverGetVersion(&result));
+    return static_cast<size_t>(result);
+  }
+  std::string Vendor() const { return "NVIDIA Corporation"; }
+
+  // Retrieves the device name. The driver writes a NUL-terminated string into a fixed-size buffer,
+  // so everything from the first NUL onwards is padding and is removed before returning.
+  std::string Name() const {
+    auto result = std::string{};
+    result.resize(kStringLength);
+    CheckError(cuDeviceGetName(&result[0], static_cast<int>(result.size()), device_));
+    const auto terminator = result.find('\0');
+    if (terminator != std::string::npos) { result.resize(terminator); }
+    return result;
+  }
+  std::string Type() const { return "GPU"; }
+  size_t MaxWorkGroupSize() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); }
+  size_t MaxWorkItemDimensions() const { return size_t{3}; }
+  std::vector<size_t> MaxWorkItemSizes() const {
+    return std::vector<size_t>{GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X),
+                               GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
+                               GetInfo(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)};
+  }
+  unsigned long LocalMemSize() const {
+    return static_cast<unsigned long>(GetInfo(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK));
+  }
+
+  // Compute capability formatted as "SM<major>.<minor>"
+  std::string Capabilities() const {
+    const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
+    const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
+    return "SM"+std::to_string(major)+"."+std::to_string(minor);
+  }
+  // No extensions are reported by the CUDA back-end: always returns false
+  bool HasExtension(const std::string &) const { return false; }
+  bool SupportsFP64() const { return true; }
+  bool SupportsFP16() const {
+    const auto major = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
+    const auto minor = GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
+    if (major > 5) { return true; } // SM 6.x, 7.x and higher
+    if (major == 5 && minor == 3) { return true; } // SM 5.3
+    return false;
+  }
+
+  // Clock rates are scaled by 1e-3 with respect to the value reported by the driver
+  size_t CoreClock() const {
+    return static_cast<size_t>(1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_CLOCK_RATE));
+  }
+  size_t ComputeUnits() const { return GetInfo(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); }
+  unsigned long MemorySize() const {
+    auto result = size_t{0};
+    CheckError(cuDeviceTotalMem(&result, device_));
+    return static_cast<unsigned long>(result);
+  }
+  unsigned long MaxAllocSize() const { return MemorySize(); }
+  size_t MemoryClock() const {
+    return static_cast<size_t>(1e-3*GetInfo(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE));
+  }
+  size_t MemoryBusWidth() const { return GetInfo(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); }
+
+  // Configuration-validity checks
+  bool IsLocalMemoryValid(const size_t local_mem_usage) const {
+    return (local_mem_usage <= LocalMemSize());
+  }
+
+  // Checks a thread-block configuration against the device limits: the number of dimensions, the
+  // size of each dimension, and the total number of threads. The dimension count is tested first
+  // so that the per-dimension limits are never indexed out of range, and the limits are queried
+  // only once.
+  bool IsThreadConfigValid(const std::vector<size_t> &local) const {
+    if (local.size() > MaxWorkItemDimensions()) { return false; }
+    const auto max_sizes = MaxWorkItemSizes();
+    auto local_size = size_t{1};
+    for (auto i=size_t{0}; i<local.size(); ++i) {
+      if (local[i] > max_sizes[i]) { return false; }
+      local_size *= local[i];
+    }
+    return (local_size <= MaxWorkGroupSize());
+  }
+
+  // Query for a specific type of device or brand
+  bool IsCPU() const { return false; }
+  bool IsGPU() const { return true; }
+  bool IsAMD() const { return false; }
+  bool IsNVIDIA() const { return true; }
+  bool IsIntel() const { return false; }
+  bool IsARM() const { return false; }
+
+  // Platform specific extensions
+  std::string AMDBoardName() const { return ""; }
+  std::string NVIDIAComputeCapability() const { return Capabilities(); }
+
+  // Accessor to the private data-member
+  const RawDeviceID& operator()() const { return device_; }
+private:
+  CUdevice device_;
+
+  // Private helper function: queries a single integer device attribute
+  size_t GetInfo(const CUdevice_attribute info) const {
+    auto result = 0;
+    CheckError(cuDeviceGetAttribute(&result, info, device_));
+    return static_cast<size_t>(result);
+  }
+};
+
+// =================================================================================================
+
+// Raw context type
+using RawContext = CUcontext;
+
+// C++11 version of 'CUcontext'. Copies of a Context share the same underlying handle.
+class Context {
+public:
+
+  // Constructor based on the regular CUDA data-type: memory management is handled elsewhere, so
+  // the wrapped context is never destroyed by this class.
+  explicit Context(const CUcontext context):
+      context_(new CUcontext(context)) {
+  }
+
+  // Regular constructor with memory management: creates a new context on the given device and
+  // destroys it when the last copy of this object goes away. The handle starts out as a null
+  // pointer so that the deleter does not act on an indeterminate value when cuCtxCreate fails and
+  // CheckError throws.
+  explicit Context(const Device &device):
+      context_(new CUcontext(nullptr), [](CUcontext* c) {
+        if (*c) { CheckErrorDtor(cuCtxDestroy(*c)); }
+        delete c;
+      }) {
+    CheckError(cuCtxCreate(context_.get(), 0, device()));
+  }
+
+  // Accessors to the private data-member
+  const RawContext& operator()() const { return *context_; }
+  RawContext* pointer() const { return context_.get(); }
+private:
+  std::shared_ptr<CUcontext> context_;
+};
+
+// Pointer to a raw CUDA context
+using ContextPointer = CUcontext*;
+
+// =================================================================================================
+
+// C++11 version of 'nvrtcProgram'. Additionally holds the program's source code (or, when created
+// from a binary, the PTX itself).
+class Program {
+public:
+  // Note that there is no constructor based on the regular CUDA data-type because of extra state
+
+  // Source-based constructor with memory management. The handle starts out as a null pointer so
+  // that the deleter does not act on an indeterminate value when nvrtcCreateProgram fails and
+  // CheckErrorNVRTC throws.
+  explicit Program(const Context &, std::string source):
+      program_(new nvrtcProgram(nullptr), [](nvrtcProgram* p) {
+        if (*p) { CheckErrorDtorNVRTC(nvrtcDestroyProgram(p)); }
+        delete p;
+      }),
+      source_(std::move(source)),
+      from_binary_(false) {
+    CheckErrorNVRTC(nvrtcCreateProgram(program_.get(), source_.c_str(),
+                                       nullptr, 0, nullptr, nullptr));
+  }
+
+  // PTX-based constructor: stores the PTX as-is. No NVRTC program is created, so 'operator()' must
+  // not be used on an object constructed this way.
+  explicit Program(const Device &, const Context &, const std::string &binary):
+      program_(nullptr), // not used
+      source_(binary),
+      from_binary_(true) {
+  }
+
+  // Compiles the device program and checks whether or not there are any warnings/errors. This is a
+  // no-op for programs constructed from PTX.
+  void Build(const Device &, std::vector<std::string> &options) {
+    if (from_binary_) { return; }
+    auto raw_options = std::vector<const char*>();
+    raw_options.reserve(options.size());
+    for (const auto &option: options) {
+      raw_options.push_back(option.c_str());
+    }
+    const auto status = nvrtcCompileProgram(*program_, static_cast<int>(raw_options.size()),
+                                            raw_options.data());
+    CLCudaAPINVRTCError::Check(status, "nvrtcCompileProgram");
+  }
+
+  // Confirms whether a certain status code is an actual compilation error or warning
+  bool StatusIsCompilationWarningOrError(const nvrtcResult status) const {
+    return (status == NVRTC_ERROR_INVALID_INPUT);
+  }
+
+  // Retrieves the warning/error message from the compiler (if any). Returns an empty string for
+  // programs constructed from PTX.
+  std::string GetBuildInfo(const Device &) const {
+    if (from_binary_) { return std::string{}; }
+    auto bytes = size_t{0};
+    CheckErrorNVRTC(nvrtcGetProgramLogSize(*program_, &bytes));
+    auto result = std::string{};
+    result.resize(bytes);
+    CheckErrorNVRTC(nvrtcGetProgramLog(*program_, &result[0]));
+    return result;
+  }
+
+  // Retrieves an intermediate representation of the compiled program (i.e. PTX)
+  std::string GetIR() const {
+    if (from_binary_) { return source_; } // holds the PTX
+    auto bytes = size_t{0};
+    CheckErrorNVRTC(nvrtcGetPTXSize(*program_, &bytes));
+    auto result = std::string{};
+    result.resize(bytes);
+    CheckErrorNVRTC(nvrtcGetPTX(*program_, &result[0]));
+    return result;
+  }
+
+  // Accessor to the private data-member (only valid for source-based programs)
+  const nvrtcProgram& operator()() const { return *program_; }
+private:
+  std::shared_ptr<nvrtcProgram> program_;
+  const std::string source_;
+  const bool from_binary_;
+};
+
+// =================================================================================================
+
+// Raw command-queue type
+using RawCommandQueue = CUstream;
+
+// C++11 version of 'CUstream'. Also remembers the context and device it was created with.
+class Queue {
+public:
+  // Note that there is no constructor based on the regular CUDA data-type because of extra state
+
+  // Regular constructor with memory management: creates a non-blocking stream and destroys it when
+  // the last copy of this object goes away. The handle starts out as a null pointer so that the
+  // deleter does not act on an indeterminate value when cuStreamCreate fails and CheckError throws.
+  explicit Queue(const Context &context, const Device &device):
+      queue_(new CUstream(nullptr), [](CUstream* s) {
+        if (*s) { CheckErrorDtor(cuStreamDestroy(*s)); }
+        delete s;
+      }),
+      context_(context),
+      device_(device) {
+    CheckError(cuStreamCreate(queue_.get(), CU_STREAM_NON_BLOCKING));
+  }
+
+  // Synchronizes the queue and optionally also an event (waits on the event's end marker first)
+  void Finish(Event &event) const {
+    CheckError(cuEventSynchronize(event.end()));
+    Finish();
+  }
+  void Finish() const {
+    CheckError(cuStreamSynchronize(*queue_));
+  }
+
+  // Retrieves the corresponding context or device
+  Context GetContext() const { return context_; }
+  Device GetDevice() const { return device_; }
+
+  // Accessor to the private data-member
+  const RawCommandQueue& operator()() const { return *queue_; }
+private:
+  std::shared_ptr<CUstream> queue_;
+  const Context context_;
+  const Device device_;
+};
+
+// =================================================================================================
+
+// C++11 version of page-locked host memory
+template <typename T>
+class BufferHost {
+public:
+
+  // Regular constructor with memory management: allocates room for 'size' elements of type T.
+  // The pointer starts out as null so that nothing is freed when cuMemAllocHost fails, and the
+  // deleter uses the destructor-safe error check, matching the other wrappers in this file.
+  explicit BufferHost(const Context &, const size_t size):
+      buffer_(new void*(nullptr), [](void** m) {
+        if (*m) { CheckErrorDtor(cuMemFreeHost(*m)); }
+        delete m;
+      }),
+      size_(size) {
+    CheckError(cuMemAllocHost(buffer_.get(), size*sizeof(T)));
+  }
+
+  // Retrieves the actual allocated size in bytes
+  size_t GetSize() const {
+    return size_*sizeof(T);
+  }
+
+  // Compatibility with std::vector. As with std::vector, 'end' points one past the last element,
+  // so that [begin, end) covers all 'size' elements and is an empty range for a size of zero.
+  size_t size() const { return size_; }
+  T* begin() { return data(); }
+  T* end() { return data() + size_; }
+  T& operator[](const size_t i) { return static_cast<T*>(*buffer_)[i]; }
+  T* data() { return static_cast<T*>(*buffer_); }
+  const T* data() const { return static_cast<T*>(*buffer_); }
+
+private:
+  std::shared_ptr<void*> buffer_;
+  const size_t size_;
+};
+
+// =================================================================================================
+
+// Enumeration of buffer access types
+enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };
+
+// C++11 version of 'CUdeviceptr'. All sizes and offsets are expressed in number of elements of
+// type T unless stated otherwise.
+template <typename T>
+class Buffer {
+public:
+
+  // Constructor based on the regular CUDA data-type: memory management is handled elsewhere
+  explicit Buffer(const CUdeviceptr buffer):
+      buffer_(new CUdeviceptr(buffer)),
+      access_(BufferAccess::kNotOwned) {
+  }
+
+  // Regular constructor with memory management. If this class does not own the buffer object, then
+  // the memory will not be freed automatically afterwards. The device pointer starts out as zero
+  // so that nothing is freed when cuMemAlloc fails, and the deleter uses the destructor-safe error
+  // check, matching the other wrappers in this file.
+  explicit Buffer(const Context &, const BufferAccess access, const size_t size):
+      buffer_(new CUdeviceptr(0), [access](CUdeviceptr* m) {
+        if (access != BufferAccess::kNotOwned && *m != 0) { CheckErrorDtor(cuMemFree(*m)); }
+        delete m;
+      }),
+      access_(access) {
+    CheckError(cuMemAlloc(buffer_.get(), size*sizeof(T)));
+  }
+
+  // As above, but now with read/write access as a default
+  explicit Buffer(const Context &context, const size_t size):
+      Buffer<T>(context, BufferAccess::kReadWrite, size) {
+  }
+
+  // Constructs a new buffer based on an existing host-container: allocates (end - start) elements
+  // and copies them to the device, waiting for the copy to complete.
+  template <typename Iterator>
+  explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
+      Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
+    const auto size = static_cast<size_t>(end - start);
+    const auto pointer = &*start;
+    CheckError(cuMemcpyHtoDAsync(*buffer_, pointer, size*sizeof(T), queue()));
+    queue.Finish();
+  }
+
+  // Copies from device to host: reading the device buffer a-synchronously. Throws a LogicError
+  // when the buffer is write-only or (for the container overloads) when the host container holds
+  // fewer than 'size' elements.
+  void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
+    if (access_ == BufferAccess::kWriteOnly) {
+      throw LogicError("Buffer: reading from a write-only buffer");
+    }
+    CheckError(cuMemcpyDtoHAsync(host, *buffer_ + offset*sizeof(T), size*sizeof(T), queue()));
+  }
+  void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
+                 const size_t offset = 0) const {
+    if (host.size() < size) {
+      throw LogicError("Buffer: target host buffer is too small");
+    }
+    ReadAsync(queue, size, host.data(), offset);
+  }
+  void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
+                 const size_t offset = 0) const {
+    if (host.size() < size) {
+      throw LogicError("Buffer: target host buffer is too small");
+    }
+    ReadAsync(queue, size, host.data(), offset);
+  }
+
+  // Copies from device to host: reading the device buffer. The container overloads go through the
+  // corresponding a-synchronous overloads so that the host-size check is applied here as well.
+  void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
+    ReadAsync(queue, size, host, offset);
+    queue.Finish();
+  }
+  void Read(const Queue &queue, const size_t size, std::vector<T> &host,
+            const size_t offset = 0) const {
+    ReadAsync(queue, size, host, offset);
+    queue.Finish();
+  }
+  void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
+            const size_t offset = 0) const {
+    ReadAsync(queue, size, host, offset);
+    queue.Finish();
+  }
+
+  // Copies from host to device: writing the device buffer a-synchronously. Throws a LogicError
+  // when the buffer is read-only or when offset+size exceeds the allocated device buffer.
+  void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
+    if (access_ == BufferAccess::kReadOnly) {
+      throw LogicError("Buffer: writing to a read-only buffer");
+    }
+    if (GetSize() < (offset+size)*sizeof(T)) {
+      throw LogicError("Buffer: target device buffer is too small");
+    }
+    CheckError(cuMemcpyHtoDAsync(*buffer_ + offset*sizeof(T), host, size*sizeof(T), queue()));
+  }
+  void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
+                  const size_t offset = 0) {
+    WriteAsync(queue, size, host.data(), offset);
+  }
+  void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
+                  const size_t offset = 0) {
+    WriteAsync(queue, size, host.data(), offset);
+  }
+
+  // Copies from host to device: writing the device buffer
+  void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
+    WriteAsync(queue, size, host, offset);
+    queue.Finish();
+  }
+  void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
+             const size_t offset = 0) {
+    Write(queue, size, host.data(), offset);
+  }
+  void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
+             const size_t offset = 0) {
+    Write(queue, size, host.data(), offset);
+  }
+
+  // Copies the contents of this buffer into another device buffer
+  void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
+    CheckError(cuMemcpyDtoDAsync(destination(), *buffer_, size*sizeof(T), queue()));
+  }
+  void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
+    CopyToAsync(queue, size, destination);
+    queue.Finish();
+  }
+
+  // Retrieves the actual allocated size in bytes, as reported by the driver
+  size_t GetSize() const {
+    auto result = size_t{0};
+    CheckError(cuMemGetAddressRange(nullptr, &result, *buffer_));
+    return result;
+  }
+
+  // Accessors to the private data-members
+  CUdeviceptr operator()() const { return *buffer_; }
+  CUdeviceptr& operator()() { return *buffer_; }
+private:
+  std::shared_ptr<CUdeviceptr> buffer_;
+  const BufferAccess access_;
+};
+
+// =================================================================================================
+
+// C++11 version of 'CUfunction'. Stores the kernel arguments as raw bytes until launch time.
+// NOTE(review): the module loaded by the Program-based constructor is never unloaded by this
+// class - confirm whether that is intentional.
+class Kernel {
+public:
+
+  // Constructor based on the regular CUDA data-type: memory management is handled elsewhere
+  explicit Kernel(const CUmodule module, const CUfunction kernel):
+      module_(module),
+      kernel_(kernel) {
+  }
+
+  // Regular constructor: loads the program's PTX as a module and looks up the kernel by name
+  explicit Kernel(const Program &program, const std::string &name) {
+    CheckError(cuModuleLoadDataEx(&module_, program.GetIR().data(), 0, nullptr, nullptr));
+    CheckError(cuModuleGetFunction(&kernel_, module_, name.c_str()));
+  }
+
+  // Sets a kernel argument at the indicated position. This stores both the value of the argument
+  // (as raw bytes) and the index indicating where this value can be found.
+  template <typename T>
+  void SetArgument(const size_t index, const T &value) {
+    if (index >= arguments_indices_.size()) { arguments_indices_.resize(index+1); }
+    arguments_indices_[index] = arguments_data_.size();
+    const auto bytes = reinterpret_cast<const char*>(&value);
+    arguments_data_.insert(arguments_data_.end(), bytes, bytes + sizeof(T));
+  }
+  template <typename T>
+  void SetArgument(const size_t index, Buffer<T> &value) {
+    SetArgument(index, value());
+  }
+
+  // Sets all arguments in one go using parameter packs. Note that this resets all previously set
+  // arguments using 'SetArgument' or 'SetArguments'.
+  template <typename... Args>
+  void SetArguments(Args&... args) {
+    arguments_indices_.clear();
+    arguments_data_.clear();
+    SetArgumentsRecursive(0, args...);
+  }
+
+  // Retrieves the amount of local memory used per work-group for this kernel. Note that this is
+  // the shared memory in CUDA terminology.
+  unsigned long LocalMemUsage(const Device &) const {
+    auto result = 0;
+    CheckError(cuFuncGetAttribute(&result, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel_));
+    return static_cast<unsigned long>(result);
+  }
+
+  // Retrieves the name of the kernel
+  std::string GetFunctionName() const {
+    return std::string{"unknown"}; // Not implemented for the CUDA backend
+  }
+
+  // Launches a kernel onto the specified queue. 'global' holds the total number of threads per
+  // dimension and 'local' the number of threads per block; both must have the same number of
+  // dimensions (at most 3) and every local size must be non-zero, otherwise a LogicError is
+  // thrown. When 'event' is non-null its start/end markers are recorded around the launch; a null
+  // 'event' launches the kernel without recording anything.
+  void Launch(const Queue &queue, const std::vector<size_t> &global,
+              const std::vector<size_t> &local, EventPointer event) {
+
+    // Creates the grid (number of threadblocks) and sets the block sizes (threads per block)
+    constexpr auto kMaxDimensions = size_t{3};
+    unsigned int grid[kMaxDimensions] = {1, 1, 1};
+    unsigned int block[kMaxDimensions] = {1, 1, 1};
+    if (global.size() != local.size()) { throw LogicError("invalid thread/workgroup dimensions"); }
+    if (local.size() > kMaxDimensions) { throw LogicError("invalid thread/workgroup dimensions"); }
+    for (auto i=size_t{0}; i<local.size(); ++i) {
+      if (local[i] == 0) { throw LogicError("invalid thread/workgroup dimensions"); }
+      grid[i] = static_cast<unsigned int>(global[i]/local[i]);
+      block[i] = static_cast<unsigned int>(local[i]);
+    }
+
+    // Creates the array of pointers from the arrays of indices & data
+    std::vector<void*> pointers;
+    pointers.reserve(arguments_indices_.size());
+    for (const auto &index: arguments_indices_) {
+      pointers.push_back(&arguments_data_[index]);
+    }
+
+    // Launches the kernel, its execution time is recorded by events (if an event was supplied)
+    if (event) { CheckError(cuEventRecord(event->start(), queue())); }
+    CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2],
+                              0, queue(), pointers.data(), nullptr));
+    if (event) { CheckError(cuEventRecord(event->end(), queue())); }
+  }
+
+  // As above, but with an event waiting list
+  // TODO: Implement this function
+  void Launch(const Queue &queue, const std::vector<size_t> &global,
+              const std::vector<size_t> &local, EventPointer event,
+              std::vector<Event>& waitForEvents) {
+    if (local.size() == 0) {
+      throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end");
+    }
+    else if (waitForEvents.size() != 0) {
+      throw LogicError("Kernel: launching with an event waiting list is not implemented for the CUDA back-end");
+    }
+    else {
+      return Launch(queue, global, local, event);
+    }
+  }
+
+  // Accessors to the private data-members
+  const CUfunction& operator()() const { return kernel_; }
+  CUfunction operator()() { return kernel_; }
+private:
+  CUmodule module_;
+  CUfunction kernel_;
+  std::vector<size_t> arguments_indices_; // Indices of the arguments
+  std::vector<char> arguments_data_; // The arguments data as raw bytes
+
+  // Internal implementation for the recursive SetArguments function.
+  template <typename T>
+  void SetArgumentsRecursive(const size_t index, T &first) {
+    SetArgument(index, first);
+  }
+  template <typename T, typename... Args>
+  void SetArgumentsRecursive(const size_t index, T &first, Args&... args) {
+    SetArgument(index, first);
+    SetArgumentsRecursive(index+1, args...);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_CUPP11_H_
+#endif
diff --git a/src/utilities/buffer_test.hpp b/src/utilities/buffer_test.hpp
index b5693181..a5b6be4b 100644
--- a/src/utilities/buffer_test.hpp
+++ b/src/utilities/buffer_test.hpp
@@ -15,7 +15,7 @@
#ifndef CLBLAST_BUFFER_TEST_H_
#define CLBLAST_BUFFER_TEST_H_
-#include "clblast.h"
+#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index b2949c27..f56226be 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -21,8 +21,13 @@
#include <complex>
#include <random>
-#include "clpp11.hpp"
-#include "clblast.h"
+#ifdef OPENCL_API
+ #include "clpp11.hpp"
+ #include "clblast.h"
+#elif CUDA_API
+ #include "cupp11.hpp"
+ #include "clblast_cuda.h"
+#endif
#include "clblast_half.h"
#include "utilities/clblast_exceptions.hpp"
#include "utilities/msvc.hpp"