diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-04-16 19:41:14 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-04-16 19:41:14 +0200 |
commit | 2673f5051820db82ebb857d88c2f36f7cacbed7d (patch) | |
tree | f3323af174bde2793b3c4692f3404d2a18c5eadb /test | |
parent | 063ef729e123aa2cebc7f67c73f99f3e15606fe2 (diff) | |
parent | b20c518f9fd05a69957c2018e72c6a648f5cdb7d (diff) |
Merge branch 'development' into benchmarking
Diffstat (limited to 'test')
157 files changed, 3702 insertions, 1124 deletions
diff --git a/test/correctness/misc/override_parameters.cpp b/test/correctness/misc/override_parameters.cpp index e6eebef7..4283c039 100644 --- a/test/correctness/misc/override_parameters.cpp +++ b/test/correctness/misc/override_parameters.cpp @@ -129,15 +129,11 @@ size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::st // ================================================================================================= } // namespace clblast -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunOverrideTests<float>(argc, argv, false, "SGEMM"); - errors += clblast::RunOverrideTests<float2>(argc, argv, true, "CGEMM"); + errors += clblast::RunOverrideTests<clblast::float2>(argc, argv, true, "CGEMM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xamax.cpp b/test/correctness/routines/level1/xamax.cpp index 607637e8..d940ae7a 100644 --- a/test/correctness/routines/level1/xamax.cpp +++ b/test/correctness/routines/level1/xamax.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xamax.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXamax<float>, float, float>(argc, argv, false, "iSAMAX"); errors += clblast::RunTests<clblast::TestXamax<double>, double, double>(argc, argv, true, "iDAMAX"); - errors += clblast::RunTests<clblast::TestXamax<float2>, float2, float2>(argc, argv, true, "iCAMAX"); - errors += clblast::RunTests<clblast::TestXamax<double2>, double2, double2>(argc, argv, true, "iZAMAX"); + errors += clblast::RunTests<clblast::TestXamax<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "iCAMAX"); + errors += clblast::RunTests<clblast::TestXamax<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "iZAMAX"); errors += clblast::RunTests<clblast::TestXamax<half>, half, half>(argc, argv, true, "iHAMAX"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xasum.cpp b/test/correctness/routines/level1/xasum.cpp index e22e42a6..b969d662 100644 --- a/test/correctness/routines/level1/xasum.cpp +++ b/test/correctness/routines/level1/xasum.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xasum.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXasum<float>, float, float>(argc, argv, false, "SASUM"); errors += clblast::RunTests<clblast::TestXasum<double>, double, double>(argc, argv, true, "DASUM"); - errors += clblast::RunTests<clblast::TestXasum<float2>, float2, float2>(argc, argv, true, "ScASUM"); - errors += clblast::RunTests<clblast::TestXasum<double2>, double2, double2>(argc, argv, true, "DzASUM"); + errors += clblast::RunTests<clblast::TestXasum<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "ScASUM"); + errors += clblast::RunTests<clblast::TestXasum<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "DzASUM"); errors += clblast::RunTests<clblast::TestXasum<half>, half, half>(argc, argv, true, "HASUM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xaxpy.cpp b/test/correctness/routines/level1/xaxpy.cpp index 064172fa..6f4f34fb 100644 --- a/test/correctness/routines/level1/xaxpy.cpp +++ b/test/correctness/routines/level1/xaxpy.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xaxpy.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXaxpy<float>, float, float>(argc, argv, false, "SAXPY"); errors += clblast::RunTests<clblast::TestXaxpy<double>, double, double>(argc, argv, true, "DAXPY"); - errors += clblast::RunTests<clblast::TestXaxpy<float2>, float2, float2>(argc, argv, true, "CAXPY"); - errors += clblast::RunTests<clblast::TestXaxpy<double2>, double2, double2>(argc, argv, true, "ZAXPY"); + errors += clblast::RunTests<clblast::TestXaxpy<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CAXPY"); + errors += clblast::RunTests<clblast::TestXaxpy<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZAXPY"); errors += clblast::RunTests<clblast::TestXaxpy<half>, half, half>(argc, argv, true, "HAXPY"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xcopy.cpp b/test/correctness/routines/level1/xcopy.cpp index e6f2581b..e6e94d34 100644 --- a/test/correctness/routines/level1/xcopy.cpp +++ b/test/correctness/routines/level1/xcopy.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xcopy.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXcopy<float>, float, float>(argc, argv, false, "SCOPY"); errors += clblast::RunTests<clblast::TestXcopy<double>, double, double>(argc, argv, true, "DCOPY"); - errors += clblast::RunTests<clblast::TestXcopy<float2>, float2, float2>(argc, argv, true, "CCOPY"); - errors += clblast::RunTests<clblast::TestXcopy<double2>, double2, double2>(argc, argv, true, "ZCOPY"); + errors += clblast::RunTests<clblast::TestXcopy<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CCOPY"); + errors += clblast::RunTests<clblast::TestXcopy<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZCOPY"); errors += clblast::RunTests<clblast::TestXcopy<half>, half, half>(argc, argv, true, "HCOPY"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xdot.cpp b/test/correctness/routines/level1/xdot.cpp index 080250cb..8dccbf26 100644 --- a/test/correctness/routines/level1/xdot.cpp +++ b/test/correctness/routines/level1/xdot.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xdot.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level1/xdotc.cpp b/test/correctness/routines/level1/xdotc.cpp index 2a7bbeca..59eedddc 100644 --- a/test/correctness/routines/level1/xdotc.cpp +++ b/test/correctness/routines/level1/xdotc.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xdotc.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXdotc<float2>, float2, float2>(argc, argv, false, "CDOTC"); - errors += clblast::RunTests<clblast::TestXdotc<double2>, double2, double2>(argc, argv, true, "ZDOTC"); + errors += clblast::RunTests<clblast::TestXdotc<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CDOTC"); + errors += clblast::RunTests<clblast::TestXdotc<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZDOTC"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xdotu.cpp b/test/correctness/routines/level1/xdotu.cpp index 1047d021..4392326d 100644 --- a/test/correctness/routines/level1/xdotu.cpp +++ b/test/correctness/routines/level1/xdotu.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xdotu.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXdotu<float2>, float2, float2>(argc, argv, false, "CDOTU"); - errors += clblast::RunTests<clblast::TestXdotu<double2>, double2, double2>(argc, argv, true, "ZDOTU"); + errors += clblast::RunTests<clblast::TestXdotu<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CDOTU"); + errors += clblast::RunTests<clblast::TestXdotu<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZDOTU"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xnrm2.cpp b/test/correctness/routines/level1/xnrm2.cpp index 142fa7ba..46ca1526 100644 --- a/test/correctness/routines/level1/xnrm2.cpp +++ b/test/correctness/routines/level1/xnrm2.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xnrm2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXnrm2<float>, float, float>(argc, argv, false, "SNRM2"); errors += clblast::RunTests<clblast::TestXnrm2<double>, double, double>(argc, argv, true, "DNRM2"); - errors += clblast::RunTests<clblast::TestXnrm2<float2>, float2, float2>(argc, argv, true, "ScNRM2"); - errors += clblast::RunTests<clblast::TestXnrm2<double2>, double2, double2>(argc, argv, true, "DzNRM2"); + errors += clblast::RunTests<clblast::TestXnrm2<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "ScNRM2"); + errors += clblast::RunTests<clblast::TestXnrm2<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "DzNRM2"); errors += clblast::RunTests<clblast::TestXnrm2<half>, half, half>(argc, argv, true, "HNRM2"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xrot.cpp b/test/correctness/routines/level1/xrot.cpp index 5af358eb..d5eb6516 100644 --- a/test/correctness/routines/level1/xrot.cpp +++ b/test/correctness/routines/level1/xrot.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xrot.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level1/xrotg.cpp b/test/correctness/routines/level1/xrotg.cpp index ad23a554..ec544eab 100644 --- a/test/correctness/routines/level1/xrotg.cpp +++ b/test/correctness/routines/level1/xrotg.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xrotg.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level1/xrotm.cpp b/test/correctness/routines/level1/xrotm.cpp index 4f7e8f15..7f2d7ce6 100644 --- a/test/correctness/routines/level1/xrotm.cpp +++ b/test/correctness/routines/level1/xrotm.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xrotm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level1/xrotmg.cpp b/test/correctness/routines/level1/xrotmg.cpp index ca89bc12..4ef6e67d 100644 --- a/test/correctness/routines/level1/xrotmg.cpp +++ b/test/correctness/routines/level1/xrotmg.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xrotmg.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level1/xscal.cpp b/test/correctness/routines/level1/xscal.cpp index 939524be..c9788142 100644 --- a/test/correctness/routines/level1/xscal.cpp +++ b/test/correctness/routines/level1/xscal.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xscal.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXscal<float>, float, float>(argc, argv, false, "SSCAL"); errors += clblast::RunTests<clblast::TestXscal<double>, double, double>(argc, argv, true, "DSCAL"); - errors += clblast::RunTests<clblast::TestXscal<float2>, float2, float2>(argc, argv, true, "CSCAL"); - errors += clblast::RunTests<clblast::TestXscal<double2>, double2, double2>(argc, argv, true, "ZSCAL"); + errors += clblast::RunTests<clblast::TestXscal<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CSCAL"); + errors += clblast::RunTests<clblast::TestXscal<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZSCAL"); errors += clblast::RunTests<clblast::TestXscal<half>, half, half>(argc, argv, true, "HSCAL"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level1/xswap.cpp b/test/correctness/routines/level1/xswap.cpp index 446f3d65..ee694a08 100644 --- a/test/correctness/routines/level1/xswap.cpp +++ b/test/correctness/routines/level1/xswap.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level1/xswap.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXswap<float>, float, float>(argc, argv, false, "SSWAP"); errors += clblast::RunTests<clblast::TestXswap<double>, double, double>(argc, argv, true, "DSWAP"); - errors += clblast::RunTests<clblast::TestXswap<float2>, float2, float2>(argc, argv, true, "CSWAP"); - errors += clblast::RunTests<clblast::TestXswap<double2>, double2, double2>(argc, argv, true, "ZSWAP"); + errors += clblast::RunTests<clblast::TestXswap<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CSWAP"); + errors += clblast::RunTests<clblast::TestXswap<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZSWAP"); errors += clblast::RunTests<clblast::TestXswap<half>, half, half>(argc, argv, true, "HSWAP"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xgbmv.cpp b/test/correctness/routines/level2/xgbmv.cpp index 8c49bc65..6aac283b 100644 --- a/test/correctness/routines/level2/xgbmv.cpp +++ b/test/correctness/routines/level2/xgbmv.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xgbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXgbmv<float>, float, float>(argc, argv, false, "SGBMV"); errors += clblast::RunTests<clblast::TestXgbmv<double>, double, double>(argc, argv, true, "DGBMV"); - errors += clblast::RunTests<clblast::TestXgbmv<float2>, float2, float2>(argc, argv, true, "CGBMV"); - errors += clblast::RunTests<clblast::TestXgbmv<double2>, double2, double2>(argc, argv, true, "ZGBMV"); + errors += clblast::RunTests<clblast::TestXgbmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGBMV"); + errors += clblast::RunTests<clblast::TestXgbmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGBMV"); errors += clblast::RunTests<clblast::TestXgbmv<half>, half, half>(argc, argv, true, "HGBMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xgemv.cpp b/test/correctness/routines/level2/xgemv.cpp index 902ae777..66994b89 100644 --- a/test/correctness/routines/level2/xgemv.cpp +++ b/test/correctness/routines/level2/xgemv.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xgemv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXgemv<float>, float, float>(argc, argv, false, "SGEMV"); errors += clblast::RunTests<clblast::TestXgemv<double>, double, double>(argc, argv, true, "DGEMV"); - errors += clblast::RunTests<clblast::TestXgemv<float2>, float2, float2>(argc, argv, true, "CGEMV"); - errors += clblast::RunTests<clblast::TestXgemv<double2>, double2, double2>(argc, argv, true, "ZGEMV"); + errors += clblast::RunTests<clblast::TestXgemv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMV"); + errors += clblast::RunTests<clblast::TestXgemv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMV"); errors += clblast::RunTests<clblast::TestXgemv<half>, half, half>(argc, argv, true, "HGEMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xger.cpp b/test/correctness/routines/level2/xger.cpp index ce61bbcb..3b5d16e9 100644 --- a/test/correctness/routines/level2/xger.cpp +++ b/test/correctness/routines/level2/xger.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xger.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level2/xgerc.cpp b/test/correctness/routines/level2/xgerc.cpp index b747f20d..42f6bb45 100644 --- a/test/correctness/routines/level2/xgerc.cpp +++ b/test/correctness/routines/level2/xgerc.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xgerc.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXgerc<float2>, float2, float2>(argc, argv, false, "CGERC"); - errors += clblast::RunTests<clblast::TestXgerc<double2>, double2, double2>(argc, argv, true, "ZGERC"); + errors += clblast::RunTests<clblast::TestXgerc<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CGERC"); + errors += clblast::RunTests<clblast::TestXgerc<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGERC"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xgeru.cpp b/test/correctness/routines/level2/xgeru.cpp index f80c1e2b..f167eff5 100644 --- a/test/correctness/routines/level2/xgeru.cpp +++ b/test/correctness/routines/level2/xgeru.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xgeru.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXgeru<float2>, float2, float2>(argc, argv, false, "CGERU"); - errors += clblast::RunTests<clblast::TestXgeru<double2>, double2, double2>(argc, argv, true, "ZGERU"); + errors += clblast::RunTests<clblast::TestXgeru<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CGERU"); + errors += clblast::RunTests<clblast::TestXgeru<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGERU"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xhbmv.cpp b/test/correctness/routines/level2/xhbmv.cpp index a4885c01..168d9474 100644 --- a/test/correctness/routines/level2/xhbmv.cpp +++ b/test/correctness/routines/level2/xhbmv.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xhbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXhbmv<float2>, float2, float2>(argc, argv, false, "CHBMV"); - errors += clblast::RunTests<clblast::TestXhbmv<double2>, double2, double2>(argc, argv, true, "ZHBMV"); + errors += clblast::RunTests<clblast::TestXhbmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CHBMV"); + errors += clblast::RunTests<clblast::TestXhbmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHBMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xhemv.cpp b/test/correctness/routines/level2/xhemv.cpp index 4318ffee..eabdf67d 100644 --- a/test/correctness/routines/level2/xhemv.cpp +++ b/test/correctness/routines/level2/xhemv.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xhemv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXhemv<float2>, float2, float2>(argc, argv, false, "CHEMV"); - errors += clblast::RunTests<clblast::TestXhemv<double2>, double2, double2>(argc, argv, true, "ZHEMV"); + errors += clblast::RunTests<clblast::TestXhemv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CHEMV"); + errors += clblast::RunTests<clblast::TestXhemv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHEMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xher.cpp b/test/correctness/routines/level2/xher.cpp index fe37bd76..a47a45ac 100644 --- a/test/correctness/routines/level2/xher.cpp +++ b/test/correctness/routines/level2/xher.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xher.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXher<float2,float>, float2, float>(argc, argv, false, "CHER"); - errors += clblast::RunTests<clblast::TestXher<double2,double>, double2, double>(argc, argv, true, "ZHER"); + errors += clblast::RunTests<clblast::TestXher<clblast::float2,float>, clblast::float2, float>(argc, argv, false, "CHER"); + errors += clblast::RunTests<clblast::TestXher<clblast::double2,double>, clblast::double2, double>(argc, argv, true, "ZHER"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xher2.cpp b/test/correctness/routines/level2/xher2.cpp index 0b4af4d0..544ab16d 100644 --- a/test/correctness/routines/level2/xher2.cpp +++ b/test/correctness/routines/level2/xher2.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xher2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXher2<float2>, float2, float2>(argc, argv, false, "CHER2"); - errors += clblast::RunTests<clblast::TestXher2<double2>, double2, double2>(argc, argv, true, "ZHER2"); + errors += clblast::RunTests<clblast::TestXher2<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CHER2"); + errors += clblast::RunTests<clblast::TestXher2<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHER2"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xhpmv.cpp b/test/correctness/routines/level2/xhpmv.cpp index dd77df71..30d23b8f 100644 --- a/test/correctness/routines/level2/xhpmv.cpp +++ b/test/correctness/routines/level2/xhpmv.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xhpmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXhpmv<float2>, float2, float2>(argc, argv, false, "CHPMV"); - errors += clblast::RunTests<clblast::TestXhpmv<double2>, double2, double2>(argc, argv, true, "ZHPMV"); + errors += clblast::RunTests<clblast::TestXhpmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CHPMV"); + errors += clblast::RunTests<clblast::TestXhpmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHPMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xhpr.cpp b/test/correctness/routines/level2/xhpr.cpp index 5a3f615f..ed876857 100644 --- a/test/correctness/routines/level2/xhpr.cpp +++ b/test/correctness/routines/level2/xhpr.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xhpr.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXhpr<float2,float>, float2, float>(argc, argv, false, "CHPR"); - errors += clblast::RunTests<clblast::TestXhpr<double2,double>, double2, double>(argc, argv, true, "ZHPR"); + errors += clblast::RunTests<clblast::TestXhpr<clblast::float2,float>, clblast::float2, float>(argc, argv, false, "CHPR"); + errors += clblast::RunTests<clblast::TestXhpr<clblast::double2,double>, clblast::double2, double>(argc, argv, true, "ZHPR"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xhpr2.cpp b/test/correctness/routines/level2/xhpr2.cpp index 8218b444..b3bd167a 100644 --- a/test/correctness/routines/level2/xhpr2.cpp +++ b/test/correctness/routines/level2/xhpr2.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xhpr2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXhpr2<float2>, float2, float2>(argc, argv, false, "CHPR2"); - errors += clblast::RunTests<clblast::TestXhpr2<double2>, double2, double2>(argc, argv, true, "ZHPR2"); + errors += clblast::RunTests<clblast::TestXhpr2<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CHPR2"); + errors += clblast::RunTests<clblast::TestXhpr2<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHPR2"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xsbmv.cpp b/test/correctness/routines/level2/xsbmv.cpp index 7918cb21..3b6b3972 100644 --- a/test/correctness/routines/level2/xsbmv.cpp +++ b/test/correctness/routines/level2/xsbmv.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xsbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level2/xspmv.cpp b/test/correctness/routines/level2/xspmv.cpp index 78210660..9dccdbc1 100644 --- a/test/correctness/routines/level2/xspmv.cpp +++ b/test/correctness/routines/level2/xspmv.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xspmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level2/xspr.cpp b/test/correctness/routines/level2/xspr.cpp index d05adf34..9cf242c1 100644 --- a/test/correctness/routines/level2/xspr.cpp +++ b/test/correctness/routines/level2/xspr.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xspr.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level2/xspr2.cpp b/test/correctness/routines/level2/xspr2.cpp index caa46a09..2650bd03 100644 --- a/test/correctness/routines/level2/xspr2.cpp +++ b/test/correctness/routines/level2/xspr2.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xspr2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level2/xsymv.cpp b/test/correctness/routines/level2/xsymv.cpp index 978a5f8a..3f0a8f8b 100644 --- a/test/correctness/routines/level2/xsymv.cpp +++ b/test/correctness/routines/level2/xsymv.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xsymv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level2/xsyr.cpp b/test/correctness/routines/level2/xsyr.cpp index 244dbfb4..15ac1f14 100644 --- a/test/correctness/routines/level2/xsyr.cpp +++ b/test/correctness/routines/level2/xsyr.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xsyr.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level2/xsyr2.cpp b/test/correctness/routines/level2/xsyr2.cpp index 422e67ad..74806219 100644 --- a/test/correctness/routines/level2/xsyr2.cpp +++ b/test/correctness/routines/level2/xsyr2.cpp @@ -12,10 +12,6 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xsyr2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; diff --git a/test/correctness/routines/level2/xtbmv.cpp b/test/correctness/routines/level2/xtbmv.cpp index 491708ec..667ae732 100644 --- a/test/correctness/routines/level2/xtbmv.cpp +++ b/test/correctness/routines/level2/xtbmv.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xtbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXtbmv<float>, float, float>(argc, argv, false, "STBMV"); errors += clblast::RunTests<clblast::TestXtbmv<double>, double, double>(argc, argv, true, "DTBMV"); - errors += clblast::RunTests<clblast::TestXtbmv<float2>, float2, float2>(argc, argv, true, "CTBMV"); - errors += clblast::RunTests<clblast::TestXtbmv<double2>, double2, double2>(argc, argv, true, "ZTBMV"); + errors += clblast::RunTests<clblast::TestXtbmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTBMV"); + errors += clblast::RunTests<clblast::TestXtbmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTBMV"); errors += clblast::RunTests<clblast::TestXtbmv<half>, half, half>(argc, argv, true, "HTBMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xtbsv.cpp b/test/correctness/routines/level2/xtbsv.cpp index 12b5dca5..5cfc6942 100644 --- a/test/correctness/routines/level2/xtbsv.cpp +++ b/test/correctness/routines/level2/xtbsv.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xtbsv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXtbsv<float>, float, float>(argc, argv, false, "STBSV"); errors += clblast::RunTests<clblast::TestXtbsv<double>, double, double>(argc, argv, true, "DTBSV"); - errors += clblast::RunTests<clblast::TestXtbsv<float2>, float2, float2>(argc, argv, true, "CTBSV"); - errors += clblast::RunTests<clblast::TestXtbsv<double2>, double2, double2>(argc, argv, true, "ZTBSV"); + errors += clblast::RunTests<clblast::TestXtbsv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTBSV"); + errors += clblast::RunTests<clblast::TestXtbsv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTBSV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xtpmv.cpp b/test/correctness/routines/level2/xtpmv.cpp index b89f0adc..89056678 100644 --- a/test/correctness/routines/level2/xtpmv.cpp +++ b/test/correctness/routines/level2/xtpmv.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xtpmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXtpmv<float>, float, float>(argc, argv, false, "STPMV"); errors += clblast::RunTests<clblast::TestXtpmv<double>, double, double>(argc, argv, true, "DTPMV"); - errors += clblast::RunTests<clblast::TestXtpmv<float2>, float2, float2>(argc, argv, true, "CTPMV"); - errors += clblast::RunTests<clblast::TestXtpmv<double2>, double2, double2>(argc, argv, true, "ZTPMV"); + errors += clblast::RunTests<clblast::TestXtpmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTPMV"); + errors += clblast::RunTests<clblast::TestXtpmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTPMV"); errors += clblast::RunTests<clblast::TestXtpmv<half>, half, half>(argc, argv, true, "HTPMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xtpsv.cpp b/test/correctness/routines/level2/xtpsv.cpp index 6e6e7c85..28c9fe39 100644 --- a/test/correctness/routines/level2/xtpsv.cpp +++ b/test/correctness/routines/level2/xtpsv.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xtpsv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXtpsv<float>, float, float>(argc, argv, false, "STPSV"); errors += clblast::RunTests<clblast::TestXtpsv<double>, double, double>(argc, argv, true, "DTPSV"); - errors += clblast::RunTests<clblast::TestXtpsv<float2>, float2, float2>(argc, argv, true, "CTPSV"); - errors += clblast::RunTests<clblast::TestXtpsv<double2>, double2, double2>(argc, argv, true, "ZTPSV"); + errors += clblast::RunTests<clblast::TestXtpsv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTPSV"); + errors += clblast::RunTests<clblast::TestXtpsv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTPSV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xtrmv.cpp b/test/correctness/routines/level2/xtrmv.cpp index 819f5cad..b1a414af 100644 --- a/test/correctness/routines/level2/xtrmv.cpp +++ b/test/correctness/routines/level2/xtrmv.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xtrmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXtrmv<float>, float, float>(argc, argv, false, "STRMV"); errors += clblast::RunTests<clblast::TestXtrmv<double>, double, double>(argc, argv, true, "DTRMV"); - errors += clblast::RunTests<clblast::TestXtrmv<float2>, float2, float2>(argc, argv, true, "CTRMV"); - errors += clblast::RunTests<clblast::TestXtrmv<double2>, double2, double2>(argc, argv, true, "ZTRMV"); + errors += clblast::RunTests<clblast::TestXtrmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTRMV"); + errors += clblast::RunTests<clblast::TestXtrmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTRMV"); errors += clblast::RunTests<clblast::TestXtrmv<half>, half, half>(argc, argv, true, "HTRMV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level2/xtrsv.cpp b/test/correctness/routines/level2/xtrsv.cpp index 78e33807..b35d7fc7 100644 --- a/test/correctness/routines/level2/xtrsv.cpp +++ b/test/correctness/routines/level2/xtrsv.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level2/xtrsv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXtrsv<float>, float, float>(argc, argv, false, "STRSV"); errors += clblast::RunTests<clblast::TestXtrsv<double>, double, double>(argc, argv, true, "DTRSV"); - errors += clblast::RunTests<clblast::TestXtrsv<float2>, float2, float2>(argc, argv, true, "CTRSV"); - errors += clblast::RunTests<clblast::TestXtrsv<double2>, double2, double2>(argc, argv, true, "ZTRSV"); + errors += clblast::RunTests<clblast::TestXtrsv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTRSV"); + errors += clblast::RunTests<clblast::TestXtrsv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTRSV"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xgemm.cpp b/test/correctness/routines/level3/xgemm.cpp index 54d41719..7fda5f2d 100644 --- a/test/correctness/routines/level3/xgemm.cpp +++ b/test/correctness/routines/level3/xgemm.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xgemm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXgemm<float>, float, float>(argc, argv, false, "SGEMM"); errors += clblast::RunTests<clblast::TestXgemm<double>, double, double>(argc, argv, true, "DGEMM"); - errors += clblast::RunTests<clblast::TestXgemm<float2>, float2, float2>(argc, argv, true, "CGEMM"); - errors += clblast::RunTests<clblast::TestXgemm<double2>, double2, double2>(argc, argv, true, "ZGEMM"); + errors += clblast::RunTests<clblast::TestXgemm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMM"); + errors += clblast::RunTests<clblast::TestXgemm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMM"); errors += clblast::RunTests<clblast::TestXgemm<half>, half, half>(argc, argv, true, "HGEMM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xhemm.cpp b/test/correctness/routines/level3/xhemm.cpp index 76c970a7..cbd277e2 100644 --- a/test/correctness/routines/level3/xhemm.cpp +++ b/test/correctness/routines/level3/xhemm.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xhemm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXhemm<float2>, float2, float2>(argc, argv, false, "CHEMM"); - errors += clblast::RunTests<clblast::TestXhemm<double2>, double2, double2>(argc, argv, true, "ZHEMM"); + errors += clblast::RunTests<clblast::TestXhemm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, false, "CHEMM"); + errors += clblast::RunTests<clblast::TestXhemm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZHEMM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xher2k.cpp b/test/correctness/routines/level3/xher2k.cpp index c653265e..e21a429c 100644 --- a/test/correctness/routines/level3/xher2k.cpp +++ b/test/correctness/routines/level3/xher2k.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xher2k.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXher2k<float2,float>, float2, float>(argc, argv, false, "CHER2K"); - errors += clblast::RunTests<clblast::TestXher2k<double2,double>, double2, double>(argc, argv, true, "ZHER2K"); + errors += clblast::RunTests<clblast::TestXher2k<clblast::float2,float>, clblast::float2, float>(argc, argv, false, "CHER2K"); + errors += clblast::RunTests<clblast::TestXher2k<clblast::double2,double>, clblast::double2, double>(argc, argv, true, "ZHER2K"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xherk.cpp b/test/correctness/routines/level3/xherk.cpp index 09ea9e4d..5665147e 100644 --- a/test/correctness/routines/level3/xherk.cpp +++ b/test/correctness/routines/level3/xherk.cpp @@ -12,15 +12,11 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xherk.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXherk<float2,float>, float2, float>(argc, argv, false, "CHERK"); - errors += clblast::RunTests<clblast::TestXherk<double2,double>, double2, double>(argc, argv, true, "ZHERK"); + errors += clblast::RunTests<clblast::TestXherk<clblast::float2,float>, clblast::float2, float>(argc, argv, false, "CHERK"); + errors += clblast::RunTests<clblast::TestXherk<clblast::double2,double>, clblast::double2, double>(argc, argv, true, "ZHERK"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xsymm.cpp b/test/correctness/routines/level3/xsymm.cpp index 3cb3515a..3e745d24 100644 --- a/test/correctness/routines/level3/xsymm.cpp +++ b/test/correctness/routines/level3/xsymm.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xsymm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXsymm<float>, float, float>(argc, argv, false, "SSYMM"); errors += clblast::RunTests<clblast::TestXsymm<double>, double, double>(argc, argv, true, "DSYMM"); - errors += clblast::RunTests<clblast::TestXsymm<float2>, float2, float2>(argc, argv, true, "CSYMM"); - errors += clblast::RunTests<clblast::TestXsymm<double2>, double2, double2>(argc, argv, true, "ZSYMM"); + errors += clblast::RunTests<clblast::TestXsymm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CSYMM"); + errors += clblast::RunTests<clblast::TestXsymm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZSYMM"); errors += clblast::RunTests<clblast::TestXsymm<half>, half, half>(argc, argv, true, "HSYMM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xsyr2k.cpp b/test/correctness/routines/level3/xsyr2k.cpp index 617af04d..b3027063 100644 --- a/test/correctness/routines/level3/xsyr2k.cpp +++ b/test/correctness/routines/level3/xsyr2k.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xsyr2k.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXsyr2k<float>, float, float>(argc, argv, false, "SSYR2K"); errors += clblast::RunTests<clblast::TestXsyr2k<double>, double, double>(argc, argv, true, "DSYR2K"); - errors += clblast::RunTests<clblast::TestXsyr2k<float2>, float2, float2>(argc, argv, true, "CSYR2K"); - errors += clblast::RunTests<clblast::TestXsyr2k<double2>, double2, double2>(argc, argv, true, "ZSYR2K"); + errors += clblast::RunTests<clblast::TestXsyr2k<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CSYR2K"); + errors += clblast::RunTests<clblast::TestXsyr2k<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZSYR2K"); errors += clblast::RunTests<clblast::TestXsyr2k<half>, half, half>(argc, argv, true, "HSYR2K"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xsyrk.cpp b/test/correctness/routines/level3/xsyrk.cpp index 2014b8d0..26c0db41 100644 --- a/test/correctness/routines/level3/xsyrk.cpp +++ b/test/correctness/routines/level3/xsyrk.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xsyrk.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXsyrk<float>, float, float>(argc, argv, false, "SSYRK"); errors += clblast::RunTests<clblast::TestXsyrk<double>, double, double>(argc, argv, true, "DSYRK"); - errors += clblast::RunTests<clblast::TestXsyrk<float2>, float2, float2>(argc, argv, true, "CSYRK"); - errors += clblast::RunTests<clblast::TestXsyrk<double2>, double2, double2>(argc, argv, true, "ZSYRK"); + errors += clblast::RunTests<clblast::TestXsyrk<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CSYRK"); + errors += clblast::RunTests<clblast::TestXsyrk<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZSYRK"); errors += clblast::RunTests<clblast::TestXsyrk<half>, half, half>(argc, argv, true, "HSYRK"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xtrmm.cpp b/test/correctness/routines/level3/xtrmm.cpp index 32640d52..63d17ed5 100644 --- a/test/correctness/routines/level3/xtrmm.cpp +++ b/test/correctness/routines/level3/xtrmm.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xtrmm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXtrmm<float>, float, float>(argc, argv, false, "STRMM"); errors += clblast::RunTests<clblast::TestXtrmm<double>, double, double>(argc, argv, true, "DTRMM"); - errors += clblast::RunTests<clblast::TestXtrmm<float2>, float2, float2>(argc, argv, true, "CTRMM"); - errors += clblast::RunTests<clblast::TestXtrmm<double2>, double2, double2>(argc, argv, true, "ZTRMM"); + errors += clblast::RunTests<clblast::TestXtrmm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTRMM"); + errors += clblast::RunTests<clblast::TestXtrmm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTRMM"); errors += clblast::RunTests<clblast::TestXtrmm<half>, half, half>(argc, argv, true, "HTRMM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/level3/xtrsm.cpp b/test/correctness/routines/level3/xtrsm.cpp index bc45a8bf..dcc20060 100644 --- a/test/correctness/routines/level3/xtrsm.cpp +++ b/test/correctness/routines/level3/xtrsm.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/level3/xtrsm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXtrsm<float>, float, float>(argc, argv, false, "STRSM"); errors += clblast::RunTests<clblast::TestXtrsm<double>, double, double>(argc, argv, true, "DTRSM"); - errors += clblast::RunTests<clblast::TestXtrsm<float2>, float2, float2>(argc, argv, true, "CTRSM"); - errors += clblast::RunTests<clblast::TestXtrsm<double2>, double2, double2>(argc, argv, true, "ZTRSM"); + errors += clblast::RunTests<clblast::TestXtrsm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CTRSM"); + errors += clblast::RunTests<clblast::TestXtrsm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZTRSM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/levelx/xaxpybatched.cpp b/test/correctness/routines/levelx/xaxpybatched.cpp index a106440f..3b906217 100644 --- a/test/correctness/routines/levelx/xaxpybatched.cpp +++ b/test/correctness/routines/levelx/xaxpybatched.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/levelx/xaxpybatched.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXaxpyBatched<float>, float, float>(argc, argv, false, "SAXPYBATCHED"); errors += clblast::RunTests<clblast::TestXaxpyBatched<double>, double, double>(argc, argv, true, "DAXPYBATCHED"); - errors += clblast::RunTests<clblast::TestXaxpyBatched<float2>, float2, float2>(argc, argv, true, "CAXPYBATCHED"); - errors += clblast::RunTests<clblast::TestXaxpyBatched<double2>, double2, double2>(argc, argv, true, "ZAXPYBATCHED"); + errors += clblast::RunTests<clblast::TestXaxpyBatched<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CAXPYBATCHED"); + errors += clblast::RunTests<clblast::TestXaxpyBatched<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZAXPYBATCHED"); errors += clblast::RunTests<clblast::TestXaxpyBatched<half>, half, half>(argc, argv, true, "HAXPYBATCHED"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/levelx/xgemmbatched.cpp b/test/correctness/routines/levelx/xgemmbatched.cpp index 748e1bb7..1e931fd5 100644 --- a/test/correctness/routines/levelx/xgemmbatched.cpp +++ b/test/correctness/routines/levelx/xgemmbatched.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/levelx/xgemmbatched.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXgemmBatched<float>, float, float>(argc, argv, false, "SGEMMBATCHED"); errors += clblast::RunTests<clblast::TestXgemmBatched<double>, double, double>(argc, argv, true, "DGEMMBATCHED"); - errors += clblast::RunTests<clblast::TestXgemmBatched<float2>, float2, float2>(argc, argv, true, "CGEMMBATCHED"); - errors += clblast::RunTests<clblast::TestXgemmBatched<double2>, double2, double2>(argc, argv, true, "ZGEMMBATCHED"); + errors += clblast::RunTests<clblast::TestXgemmBatched<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMMBATCHED"); + errors += clblast::RunTests<clblast::TestXgemmBatched<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMMBATCHED"); errors += clblast::RunTests<clblast::TestXgemmBatched<half>, half, half>(argc, argv, true, "HGEMMBATCHED"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/levelx/xomatcopy.cpp b/test/correctness/routines/levelx/xomatcopy.cpp index e034bc18..f512432b 100644 --- a/test/correctness/routines/levelx/xomatcopy.cpp +++ b/test/correctness/routines/levelx/xomatcopy.cpp @@ -12,17 +12,13 @@ #include "test/correctness/testblas.hpp" #include "test/routines/levelx/xomatcopy.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; errors += clblast::RunTests<clblast::TestXomatcopy<float>, float, float>(argc, argv, false, "SOMATCOPY"); errors += clblast::RunTests<clblast::TestXomatcopy<double>, double, double>(argc, argv, true, "DOMATCOPY"); - errors += clblast::RunTests<clblast::TestXomatcopy<float2>, float2, float2>(argc, argv, true, "COMATCOPY"); - errors += clblast::RunTests<clblast::TestXomatcopy<double2>, double2, double2>(argc, argv, true, "ZOMATCOPY"); + errors += clblast::RunTests<clblast::TestXomatcopy<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "COMATCOPY"); + errors += clblast::RunTests<clblast::TestXomatcopy<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZOMATCOPY"); errors += clblast::RunTests<clblast::TestXomatcopy<half>, half, half>(argc, argv, true, "HOMATCOPY"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp index 40784fdb..d1f3cbb2 100644 --- a/test/correctness/tester.cpp +++ b/test/correctness/tester.cpp @@ -116,24 +116,44 @@ Tester<T,U>::Tester(const std::vector<std::string> &arguments, const bool silent tests_failed_{0} { options_ = options; + // Determines which reference is the default + #if defined(CLBLAST_REF_CBLAS) + auto default_cblas = 0; + #endif + #if defined(CLBLAST_REF_CLBLAS) + auto default_clblas = 0; + #endif + #if defined(CLBLAST_REF_CUBLAS) + auto default_cublas = 0; + #endif + #if defined(CLBLAST_REF_CBLAS) + default_cblas = 1; + #elif defined(CLBLAST_REF_CLBLAS) + default_clblas = 1; + #elif defined(CLBLAST_REF_CUBLAS) + default_cublas = 1; + #endif + // Determines which reference to test against - #if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS) - compare_clblas_ = GetArgument(arguments, help_, kArgCompareclblas, 0); - compare_cblas_ = GetArgument(arguments, help_, kArgComparecblas, 1); - #elif CLBLAST_REF_CLBLAS - compare_clblas_ = GetArgument(arguments, help_, kArgCompareclblas, 1); - compare_cblas_ = 0; - #elif CLBLAST_REF_CBLAS - compare_clblas_ = 0; - compare_cblas_ = GetArgument(arguments, help_, kArgComparecblas, 1); - #else - compare_clblas_ = 0; - compare_cblas_ = 0; + compare_clblas_ = 0; + compare_cblas_ = 0; + compare_cublas_ = 0; + #if defined(CLBLAST_REF_CBLAS) + compare_cblas_ = GetArgument(arguments, help_, kArgComparecblas, default_cblas); + #endif + #if defined(CLBLAST_REF_CLBLAS) + compare_clblas_ = GetArgument(arguments, help_, kArgCompareclblas, default_clblas); + #endif + #if defined(CLBLAST_REF_CUBLAS) + compare_cublas_ = GetArgument(arguments, help_, kArgComparecublas, default_cublas); #endif // Prints the help message (command-line arguments) if (!silent) { fprintf(stdout, "\n* %s\n", help_.c_str()); } + // Support for cuBLAS not available yet + if (compare_cublas_) { throw std::runtime_error("Cannot test against cuBLAS; not implemented yet"); } + // Can only test against a single reference (not two, not zero) if (compare_clblas_ && compare_cblas_) { throw std::runtime_error("Cannot test against both clBLAS and CBLAS references; choose one using the -cblas and -clblas arguments"); diff --git a/test/correctness/tester.hpp b/test/correctness/tester.hpp index f60be04b..8cfa702f 100644 --- a/test/correctness/tester.hpp +++ b/test/correctness/tester.hpp @@ -113,6 +113,7 @@ class Tester { // Testing against reference implementations int compare_cblas_; int compare_clblas_; + int compare_cublas_; private: diff --git a/test/performance/client.cpp b/test/performance/client.cpp index 48d6708e..dc98ffbd 100644 --- a/test/performance/client.cpp +++ b/test/performance/client.cpp @@ -30,13 +30,14 @@ template <typename T, typename U> const int Client<T,U>::kSeed = 42; // fixed se template <typename T, typename U> Client<T,U>::Client(const Routine run_routine, const Reference1 run_reference1, const Reference2 run_reference2, - const std::vector<std::string> &options, + const Reference3 run_reference3, const std::vector<std::string> &options, const std::vector<std::string> &buffers_in, const std::vector<std::string> &buffers_out, const GetMetric get_flops, const GetMetric get_bytes): run_routine_(run_routine), run_reference1_(run_reference1), run_reference2_(run_reference2), + run_reference3_(run_reference3), options_(options), buffers_in_(buffers_in), buffers_out_(buffers_out), @@ -119,6 +120,11 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t le #else args.compare_cblas = 0; #endif + #ifdef CLBLAST_REF_CUBLAS + args.compare_cublas = GetArgument(command_line_args, help, kArgComparecublas, 1); + #else + args.compare_cublas = 0; + #endif args.step = GetArgument(command_line_args, help, kArgStepSize, size_t{1}); args.num_steps = GetArgument(command_line_args, help, kArgNumSteps, size_t{0}); args.num_runs = GetArgument(command_line_args, help, kArgNumRuns, size_t{10}); @@ -133,24 +139,26 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t le // Comparison against a non-BLAS routine is not supported if (level == 4) { // level-4 == level-X - if (args.compare_clblas != 0 || args.compare_cblas != 0) { + if (args.compare_clblas != 0 || args.compare_cblas != 0 || args.compare_cublas != 0) { if (!args.silent) { - fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for this non-BLAS routine\n\n"); + fprintf(stdout, "* Disabling clBLAS/CBLAS/cuBLAS comparisons for this non-BLAS routine\n\n"); } } args.compare_clblas = 0; args.compare_cblas = 0; + args.compare_cublas = 0; } - // Comparison against clBLAS or a CPU BLAS library is not supported in case of half-precision + // Comparison against other BLAS libraries is not supported in case of half-precision if (args.precision == Precision::kHalf) { - if (args.compare_clblas != 0 || args.compare_cblas != 0) { + if (args.compare_clblas != 0 || args.compare_cblas != 0 || args.compare_cublas != 0) { if (!args.silent) { - fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for half-precision\n\n"); + fprintf(stdout, "* Disabling clBLAS/CBLAS/cuBLAS comparisons for half-precision\n\n"); } } args.compare_clblas = 0; args.compare_cblas = 0; + args.compare_cublas = 0; } // Returns the arguments @@ -174,6 +182,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) #ifdef CLBLAST_REF_CLBLAS if (args.compare_clblas) { clblasSetup(); } #endif + #ifdef CLBLAST_REF_CUBLAS + if (args.compare_cublas) { cublasSetup(args); } + #endif // Iterates over all "num_step" values jumping by "step" each time auto s = size_t{0}; @@ -232,6 +243,16 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) HostToDevice(args, buffers, buffers_host, queue, buffers_out_); timings.push_back(std::pair<std::string, double>("CPU BLAS", ms_cblas)); } + if (args.compare_cublas) { + auto buffers_host = BuffersHost<T>(); + auto buffers_cuda = BuffersCUDA<T>(); + DeviceToHost(args, buffers, buffers_host, queue, buffers_in_); + HostToCUDA(args, buffers_cuda, buffers_host, buffers_in_); + auto ms_cublas = TimedExecution(args.num_runs, args, buffers_cuda, queue, run_reference3_, "cuBLAS"); + CUDAToHost(args, buffers_cuda, buffers_host, buffers_out_); + HostToDevice(args, buffers, buffers_host, queue, buffers_out_); + timings.push_back(std::pair<std::string, double>("cuBLAS", ms_cublas)); + } // Prints the performance of the tested libraries PrintTableRow(args, timings); @@ -251,6 +272,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) #ifdef CLBLAST_REF_CLBLAS if (args.compare_clblas) { clblasTeardown(); } #endif + #ifdef CLBLAST_REF_CUBLAS + if (args.compare_cublas) { cublasTeardown(args); } + #endif } // ================================================================================================= @@ -307,6 +331,7 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) { fprintf(stdout, " | <-- CLBlast -->"); if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); } if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); } + if (args.compare_cublas) { fprintf(stdout, " | <-- cuBLAS -->"); } fprintf(stdout, " |\n"); } @@ -315,6 +340,7 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) { fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1"); if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); } if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); } + if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_4", "GFLOPS_4", "GBs_4"); } fprintf(stdout, "\n"); } diff --git a/test/performance/client.hpp b/test/performance/client.hpp index 12fd113d..47a13017 100644 --- a/test/performance/client.hpp +++ b/test/performance/client.hpp @@ -31,6 +31,7 @@ #ifdef CLBLAST_REF_CLBLAS #include <clBLAS.h> #endif +#include "test/wrapper_cuda.hpp" #include "clblast.h" namespace clblast { @@ -46,12 +47,13 @@ class Client { using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>; using Reference1 = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>; using Reference2 = std::function<StatusCode(const Arguments<U>&, BuffersHost<T>&, Queue&)>; + using Reference3 = std::function<StatusCode(const Arguments<U>&, BuffersCUDA<T>&, Queue&)>; using SetMetric = std::function<void(Arguments<U>&)>; using GetMetric = std::function<size_t(const Arguments<U>&)>; // The constructor Client(const Routine run_routine, const Reference1 run_reference1, const Reference2 run_reference2, - const std::vector<std::string> &options, + const Reference3 run_reference3, const std::vector<std::string> &options, const std::vector<std::string> &buffers_in, const std::vector<std::string> &buffers_out, const GetMetric get_flops, const GetMetric get_bytes); @@ -84,6 +86,7 @@ class Client { const Routine run_routine_; const Reference1 run_reference1_; const Reference2 run_reference2_; + const Reference3 run_reference3_; const std::vector<std::string> options_; const std::vector<std::string> buffers_in_; const std::vector<std::string> buffers_out_; @@ -118,9 +121,14 @@ void RunClient(int argc, char *argv[]) { #else auto reference2 = ReferenceNotAvailable<T,U,BuffersHost<T>>; #endif + #ifdef CLBLAST_REF_CUBLAS + auto reference3 = C::RunReference3; // cuBLAS when available + #else + auto reference3 = ReferenceNotAvailable<T,U,BuffersCUDA<T>>; + #endif // Creates a new client - auto client = Client<T,U>(C::RunRoutine, reference1, reference2, C::GetOptions(), + auto client = Client<T,U>(C::RunRoutine, reference1, reference2, reference3, C::GetOptions(), C::BuffersIn(), C::BuffersOut(), C::GetFlops, C::GetBytes); // Simple command line argument parser with defaults diff --git a/test/performance/routines/level1/xamax.cpp b/test/performance/routines/level1/xamax.cpp index 5dc7b3d9..5cbef604 100644 --- a/test/performance/routines/level1/xamax.cpp +++ b/test/performance/routines/level1/xamax.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xamax.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXamax<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXamax<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXamax<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXamax<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXamax<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xasum.cpp b/test/performance/routines/level1/xasum.cpp index bf5b2fa9..7fccb678 100644 --- a/test/performance/routines/level1/xasum.cpp +++ b/test/performance/routines/level1/xasum.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xasum.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXasum<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXasum<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXasum<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXasum<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXasum<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xaxpy.cpp b/test/performance/routines/level1/xaxpy.cpp index faccc089..739408bb 100644 --- a/test/performance/routines/level1/xaxpy.cpp +++ b/test/performance/routines/level1/xaxpy.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xaxpy.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXaxpy<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXaxpy<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXaxpy<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXaxpy<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXaxpy<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xcopy.cpp b/test/performance/routines/level1/xcopy.cpp index 8aa536af..902c394f 100644 --- a/test/performance/routines/level1/xcopy.cpp +++ b/test/performance/routines/level1/xcopy.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xcopy.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXcopy<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXcopy<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXcopy<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXcopy<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXcopy<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xdot.cpp b/test/performance/routines/level1/xdot.cpp index 9a570e1e..b2d4d969 100644 --- a/test/performance/routines/level1/xdot.cpp +++ b/test/performance/routines/level1/xdot.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xdot.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level1/xdotc.cpp b/test/performance/routines/level1/xdotc.cpp index 426b81ae..308bcdab 100644 --- a/test/performance/routines/level1/xdotc.cpp +++ b/test/performance/routines/level1/xdotc.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xdotc.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXdotc<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXdotc<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXdotc<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXdotc<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xdotu.cpp b/test/performance/routines/level1/xdotu.cpp index 4fbe167d..fc54a8a5 100644 --- a/test/performance/routines/level1/xdotu.cpp +++ b/test/performance/routines/level1/xdotu.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xdotu.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXdotu<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXdotu<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXdotu<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXdotu<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xnrm2.cpp b/test/performance/routines/level1/xnrm2.cpp index 6a1cdcc7..769335eb 100644 --- a/test/performance/routines/level1/xnrm2.cpp +++ b/test/performance/routines/level1/xnrm2.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xnrm2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXnrm2<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXnrm2<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXnrm2<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXnrm2<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXnrm2<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xrot.cpp b/test/performance/routines/level1/xrot.cpp index 2b94ca39..f010e04a 100644 --- a/test/performance/routines/level1/xrot.cpp +++ b/test/performance/routines/level1/xrot.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xrot.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level1/xrotg.cpp b/test/performance/routines/level1/xrotg.cpp index ee6fc44b..4c8d33cf 100644 --- a/test/performance/routines/level1/xrotg.cpp +++ b/test/performance/routines/level1/xrotg.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xrotg.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level1/xrotm.cpp b/test/performance/routines/level1/xrotm.cpp index e8d73311..bc2111b3 100644 --- a/test/performance/routines/level1/xrotm.cpp +++ b/test/performance/routines/level1/xrotm.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xrotm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level1/xrotmg.cpp b/test/performance/routines/level1/xrotmg.cpp index a5266b14..fb568243 100644 --- a/test/performance/routines/level1/xrotmg.cpp +++ b/test/performance/routines/level1/xrotmg.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xrotmg.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level1/xscal.cpp b/test/performance/routines/level1/xscal.cpp index 6fefc5d0..b9db60cf 100644 --- a/test/performance/routines/level1/xscal.cpp +++ b/test/performance/routines/level1/xscal.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xscal.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXscal<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXscal<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXscal<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXscal<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXscal<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level1/xswap.cpp b/test/performance/routines/level1/xswap.cpp index b728b8f4..db40f6e4 100644 --- a/test/performance/routines/level1/xswap.cpp +++ b/test/performance/routines/level1/xswap.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level1/xswap.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXswap<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXswap<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXswap<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXswap<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXswap<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xgbmv.cpp b/test/performance/routines/level2/xgbmv.cpp index 6a4b01f8..23a91503 100644 --- a/test/performance/routines/level2/xgbmv.cpp +++ b/test/performance/routines/level2/xgbmv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xgbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXgbmv<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXgbmv<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXgbmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXgbmv<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXgbmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xgemv.cpp b/test/performance/routines/level2/xgemv.cpp index 335d5ef1..3bb14b68 100644 --- a/test/performance/routines/level2/xgemv.cpp +++ b/test/performance/routines/level2/xgemv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xgemv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXgemv<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXgemv<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXgemv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXgemv<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXgemv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xger.cpp b/test/performance/routines/level2/xger.cpp index 50fdb9e6..ca23b8f0 100644 --- a/test/performance/routines/level2/xger.cpp +++ b/test/performance/routines/level2/xger.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xger.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level2/xgerc.cpp b/test/performance/routines/level2/xgerc.cpp index 67c72285..0423cdd5 100644 --- a/test/performance/routines/level2/xgerc.cpp +++ b/test/performance/routines/level2/xgerc.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xgerc.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXgerc<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXgerc<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXgerc<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXgerc<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xgeru.cpp b/test/performance/routines/level2/xgeru.cpp index 6e845bb8..c0fbb2d5 100644 --- a/test/performance/routines/level2/xgeru.cpp +++ b/test/performance/routines/level2/xgeru.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xgeru.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXgeru<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXgeru<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXgeru<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXgeru<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xhbmv.cpp b/test/performance/routines/level2/xhbmv.cpp index 600317c1..d59cba26 100644 --- a/test/performance/routines/level2/xhbmv.cpp +++ b/test/performance/routines/level2/xhbmv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xhbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXhbmv<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXhbmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXhbmv<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXhbmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xhemv.cpp b/test/performance/routines/level2/xhemv.cpp index 7700cf7b..1664b6cd 100644 --- a/test/performance/routines/level2/xhemv.cpp +++ b/test/performance/routines/level2/xhemv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xhemv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXhemv<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXhemv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXhemv<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXhemv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xher.cpp b/test/performance/routines/level2/xher.cpp index e7276aee..434f486c 100644 --- a/test/performance/routines/level2/xher.cpp +++ b/test/performance/routines/level2/xher.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xher.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXher<float2,float>, float2, float>(argc, argv); break; + clblast::RunClient<clblast::TestXher<clblast::float2,float>, clblast::float2, float>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXher<double2,double>, double2, double>(argc, argv); break; + clblast::RunClient<clblast::TestXher<clblast::double2,double>, clblast::double2, double>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xher2.cpp b/test/performance/routines/level2/xher2.cpp index b4c53206..cce40a9e 100644 --- a/test/performance/routines/level2/xher2.cpp +++ b/test/performance/routines/level2/xher2.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xher2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXher2<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXher2<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXher2<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXher2<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xhpmv.cpp b/test/performance/routines/level2/xhpmv.cpp index d9683d2e..d88791fe 100644 --- a/test/performance/routines/level2/xhpmv.cpp +++ b/test/performance/routines/level2/xhpmv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xhpmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXhpmv<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXhpmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXhpmv<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXhpmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xhpr.cpp b/test/performance/routines/level2/xhpr.cpp index c4ffaf81..a92a3134 100644 --- a/test/performance/routines/level2/xhpr.cpp +++ b/test/performance/routines/level2/xhpr.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xhpr.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXhpr<float2,float>, float2, float>(argc, argv); break; + clblast::RunClient<clblast::TestXhpr<clblast::float2,float>, clblast::float2, float>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXhpr<double2,double>, double2, double>(argc, argv); break; + clblast::RunClient<clblast::TestXhpr<clblast::double2,double>, clblast::double2, double>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xhpr2.cpp b/test/performance/routines/level2/xhpr2.cpp index 3e5d4004..f34de29b 100644 --- a/test/performance/routines/level2/xhpr2.cpp +++ b/test/performance/routines/level2/xhpr2.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xhpr2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXhpr2<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXhpr2<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXhpr2<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXhpr2<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xsbmv.cpp b/test/performance/routines/level2/xsbmv.cpp index 9c0ab3b6..59bbf40c 100644 --- a/test/performance/routines/level2/xsbmv.cpp +++ b/test/performance/routines/level2/xsbmv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xsbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level2/xspmv.cpp b/test/performance/routines/level2/xspmv.cpp index 6cc4e3ba..9ba29f43 100644 --- a/test/performance/routines/level2/xspmv.cpp +++ b/test/performance/routines/level2/xspmv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xspmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level2/xspr.cpp b/test/performance/routines/level2/xspr.cpp index dc45ba6d..57551f5d 100644 --- a/test/performance/routines/level2/xspr.cpp +++ b/test/performance/routines/level2/xspr.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xspr.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level2/xspr2.cpp b/test/performance/routines/level2/xspr2.cpp index 3c9a769f..573fb652 100644 --- a/test/performance/routines/level2/xspr2.cpp +++ b/test/performance/routines/level2/xspr2.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xspr2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level2/xsymv.cpp b/test/performance/routines/level2/xsymv.cpp index aaa98c8b..25933d8d 100644 --- a/test/performance/routines/level2/xsymv.cpp +++ b/test/performance/routines/level2/xsymv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xsymv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level2/xsyr.cpp b/test/performance/routines/level2/xsyr.cpp index d710bf63..3b54510d 100644 --- a/test/performance/routines/level2/xsyr.cpp +++ b/test/performance/routines/level2/xsyr.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xsyr.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level2/xsyr2.cpp b/test/performance/routines/level2/xsyr2.cpp index 39b46b6a..ab9641c2 100644 --- a/test/performance/routines/level2/xsyr2.cpp +++ b/test/performance/routines/level2/xsyr2.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xsyr2.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); diff --git a/test/performance/routines/level2/xtbmv.cpp b/test/performance/routines/level2/xtbmv.cpp index 5fb3ea14..319f9c80 100644 --- a/test/performance/routines/level2/xtbmv.cpp +++ b/test/performance/routines/level2/xtbmv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xtbmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXtbmv<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXtbmv<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXtbmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXtbmv<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXtbmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xtbsv.cpp b/test/performance/routines/level2/xtbsv.cpp index 7b88917c..4d37e76d 100644 --- a/test/performance/routines/level2/xtbsv.cpp +++ b/test/performance/routines/level2/xtbsv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xtbsv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -26,9 +22,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXtbsv<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXtbsv<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXtbsv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXtbsv<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXtbsv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xtpmv.cpp b/test/performance/routines/level2/xtpmv.cpp index 907749a7..c2db51b1 100644 --- a/test/performance/routines/level2/xtpmv.cpp +++ b/test/performance/routines/level2/xtpmv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xtpmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXtpmv<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXtpmv<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXtpmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXtpmv<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXtpmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xtpsv.cpp b/test/performance/routines/level2/xtpsv.cpp index 0dab8ff6..b01a9f05 100644 --- a/test/performance/routines/level2/xtpsv.cpp +++ b/test/performance/routines/level2/xtpsv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xtpsv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -26,9 +22,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXtpsv<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXtpsv<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXtpsv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXtpsv<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXtpsv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xtrmv.cpp b/test/performance/routines/level2/xtrmv.cpp index c2c6f232..610a5052 100644 --- a/test/performance/routines/level2/xtrmv.cpp +++ b/test/performance/routines/level2/xtrmv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xtrmv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXtrmv<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXtrmv<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXtrmv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXtrmv<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXtrmv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level2/xtrsv.cpp b/test/performance/routines/level2/xtrsv.cpp index 49e477f7..02255e71 100644 --- a/test/performance/routines/level2/xtrsv.cpp +++ b/test/performance/routines/level2/xtrsv.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level2/xtrsv.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -26,9 +22,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXtrsv<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXtrsv<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXtrsv<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXtrsv<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXtrsv<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xgemm.cpp b/test/performance/routines/level3/xgemm.cpp index deb2493f..602e1a20 100644 --- a/test/performance/routines/level3/xgemm.cpp +++ b/test/performance/routines/level3/xgemm.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xgemm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXgemm<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXgemm<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXgemm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXgemm<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXgemm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xhemm.cpp b/test/performance/routines/level3/xhemm.cpp index 975c672f..6c3687a9 100644 --- a/test/performance/routines/level3/xhemm.cpp +++ b/test/performance/routines/level3/xhemm.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xhemm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXhemm<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXhemm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXhemm<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXhemm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xher2k.cpp b/test/performance/routines/level3/xher2k.cpp index d579d4f9..9d3385f7 100644 --- a/test/performance/routines/level3/xher2k.cpp +++ b/test/performance/routines/level3/xher2k.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xher2k.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXher2k<float2,float>, float2, float>(argc, argv); break; + clblast::RunClient<clblast::TestXher2k<clblast::float2,float>, clblast::float2, float>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXher2k<double2,double>, double2, double>(argc, argv); break; + clblast::RunClient<clblast::TestXher2k<clblast::double2,double>, clblast::double2, double>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xherk.cpp b/test/performance/routines/level3/xherk.cpp index 94411e5a..ae6e774e 100644 --- a/test/performance/routines/level3/xherk.cpp +++ b/test/performance/routines/level3/xherk.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xherk.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -24,9 +20,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXherk<float2,float>, float2, float>(argc, argv); break; + clblast::RunClient<clblast::TestXherk<clblast::float2,float>, clblast::float2, float>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXherk<double2,double>, double2, double>(argc, argv); break; + clblast::RunClient<clblast::TestXherk<clblast::double2,double>, clblast::double2, double>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xsymm.cpp b/test/performance/routines/level3/xsymm.cpp index 04ae8eb0..ba3b6ab2 100644 --- a/test/performance/routines/level3/xsymm.cpp +++ b/test/performance/routines/level3/xsymm.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xsymm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXsymm<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXsymm<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXsymm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXsymm<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXsymm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xsyr2k.cpp b/test/performance/routines/level3/xsyr2k.cpp index 7b8b6f4f..150a4192 100644 --- a/test/performance/routines/level3/xsyr2k.cpp +++ b/test/performance/routines/level3/xsyr2k.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xsyr2k.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXsyr2k<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXsyr2k<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXsyr2k<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXsyr2k<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXsyr2k<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xsyrk.cpp b/test/performance/routines/level3/xsyrk.cpp index ea0fc33b..00cef52b 100644 --- a/test/performance/routines/level3/xsyrk.cpp +++ b/test/performance/routines/level3/xsyrk.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xsyrk.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXsyrk<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXsyrk<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXsyrk<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXsyrk<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXsyrk<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xtrmm.cpp b/test/performance/routines/level3/xtrmm.cpp index 7a29e111..fb54a410 100644 --- a/test/performance/routines/level3/xtrmm.cpp +++ b/test/performance/routines/level3/xtrmm.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xtrmm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXtrmm<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXtrmm<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXtrmm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXtrmm<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXtrmm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/level3/xtrsm.cpp b/test/performance/routines/level3/xtrsm.cpp index 342274b7..f44265f2 100644 --- a/test/performance/routines/level3/xtrsm.cpp +++ b/test/performance/routines/level3/xtrsm.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/level3/xtrsm.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -26,9 +22,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXtrsm<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXtrsm<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXtrsm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXtrsm<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXtrsm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/levelx/xaxpybatched.cpp b/test/performance/routines/levelx/xaxpybatched.cpp index 6d3bcb51..7c09cd5b 100644 --- a/test/performance/routines/levelx/xaxpybatched.cpp +++ b/test/performance/routines/levelx/xaxpybatched.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/levelx/xaxpybatched.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXaxpyBatched<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXaxpyBatched<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXaxpyBatched<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXaxpyBatched<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXaxpyBatched<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/levelx/xgemmbatched.cpp b/test/performance/routines/levelx/xgemmbatched.cpp index c9477fad..f4c860d8 100644 --- a/test/performance/routines/levelx/xgemmbatched.cpp +++ b/test/performance/routines/levelx/xgemmbatched.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/levelx/xgemmbatched.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXgemmBatched<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXgemmBatched<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXgemmBatched<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXgemmBatched<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXgemmBatched<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/performance/routines/levelx/xomatcopy.cpp b/test/performance/routines/levelx/xomatcopy.cpp index 5821c3b8..568f22e6 100644 --- a/test/performance/routines/levelx/xomatcopy.cpp +++ b/test/performance/routines/levelx/xomatcopy.cpp @@ -12,10 +12,6 @@ #include "test/performance/client.hpp" #include "test/routines/levelx/xomatcopy.hpp" -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); @@ -27,9 +23,9 @@ int main(int argc, char *argv[]) { case clblast::Precision::kDouble: clblast::RunClient<clblast::TestXomatcopy<double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXomatcopy<float2>, float2, float2>(argc, argv); break; + clblast::RunClient<clblast::TestXomatcopy<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXomatcopy<double2>, double2, double2>(argc, argv); break; + clblast::RunClient<clblast::TestXomatcopy<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/routines/common.hpp b/test/routines/common.hpp new file mode 100644 index 00000000..9708288a --- /dev/null +++ b/test/routines/common.hpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains all the common includes for the clients and tests +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_COMMON_H_ +#define CLBLAST_TEST_ROUTINES_COMMON_H_ + +#include <vector> +#include <string> + +#include "utilities/utilities.hpp" + +#ifdef CLBLAST_REF_CLBLAS + #include "test/wrapper_clblas.hpp" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "test/wrapper_cblas.hpp" +#endif +#include "test/wrapper_cuda.hpp" +#ifdef CLBLAST_REF_CUBLAS + #include "test/wrapper_cublas.hpp" +#endif + +// ================================================================================================= + +// CLBLAST_TEST_ROUTINES_COMMON_H_ +#endif diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp index 2e844f2c..04bdaa3d 100644 --- a/test/routines/level1/xamax.hpp +++ b/test/routines/level1/xamax.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XAMAX_H_ #define CLBLAST_TEST_ROUTINES_XAMAX_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -111,6 +103,16 @@ class TestXamax { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXamax(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n, + buffers.scalar, args.imax_offset, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.scalar_size, static_cast<T>(0)); diff --git a/test/routines/level1/xasum.hpp b/test/routines/level1/xasum.hpp index 8488bfc6..6add9c64 100644 --- a/test/routines/level1/xasum.hpp +++ b/test/routines/level1/xasum.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XASUM_H_ #define CLBLAST_TEST_ROUTINES_XASUM_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -111,6 +103,16 @@ class TestXasum { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXasum(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n, + buffers.scalar, args.asum_offset, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.scalar_size, static_cast<T>(0)); diff --git a/test/routines/level1/xaxpy.hpp b/test/routines/level1/xaxpy.hpp index cc7d251a..17cae6ad 100644 --- a/test/routines/level1/xaxpy.hpp +++ b/test/routines/level1/xaxpy.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XAXPY_H_ #define CLBLAST_TEST_ROUTINES_XAXPY_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -112,6 +104,16 @@ class TestXaxpy { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXaxpy(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.y_size, static_cast<T>(0)); diff --git a/test/routines/level1/xcopy.hpp b/test/routines/level1/xcopy.hpp index 0dbf0f3d..7a5c99b8 100644 --- a/test/routines/level1/xcopy.hpp +++ b/test/routines/level1/xcopy.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XCOPY_H_ #define CLBLAST_TEST_ROUTINES_XCOPY_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -111,6 +103,16 @@ class TestXcopy { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXcopy(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.y_size, static_cast<T>(0)); diff --git a/test/routines/level1/xdot.hpp b/test/routines/level1/xdot.hpp index bdf2e721..1ea25994 100644 --- a/test/routines/level1/xdot.hpp +++ b/test/routines/level1/xdot.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XDOT_H_ #define CLBLAST_TEST_ROUTINES_XDOT_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -118,6 +110,17 @@ class TestXdot { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXdot(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.scalar_size, static_cast<T>(0)); diff --git a/test/routines/level1/xdotc.hpp b/test/routines/level1/xdotc.hpp index 2cc71b93..c800c1f5 100644 --- a/test/routines/level1/xdotc.hpp +++ b/test/routines/level1/xdotc.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XDOTC_H_ #define CLBLAST_TEST_ROUTINES_XDOTC_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -118,6 +110,17 @@ class TestXdotc { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXdotc(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.scalar_size, static_cast<T>(0)); diff --git a/test/routines/level1/xdotu.hpp b/test/routines/level1/xdotu.hpp index 272e1e31..3545a3a6 100644 --- a/test/routines/level1/xdotu.hpp +++ b/test/routines/level1/xdotu.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XDOTU_H_ #define CLBLAST_TEST_ROUTINES_XDOTU_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -118,6 +110,17 @@ class TestXdotu { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXdotu(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.scalar_size, static_cast<T>(0)); diff --git a/test/routines/level1/xnrm2.hpp b/test/routines/level1/xnrm2.hpp index cb1ec683..1db70537 100644 --- a/test/routines/level1/xnrm2.hpp +++ b/test/routines/level1/xnrm2.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XNRM2_H_ #define CLBLAST_TEST_ROUTINES_XNRM2_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -111,6 +103,16 @@ class TestXnrm2 { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXnrm2(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n, + buffers.scalar, args.nrm2_offset, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.scalar_size, static_cast<T>(0)); diff --git a/test/routines/level1/xscal.hpp b/test/routines/level1/xscal.hpp index 3e6b9a38..efa0988d 100644 --- a/test/routines/level1/xscal.hpp +++ b/test/routines/level1/xscal.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSCAL_H_ #define CLBLAST_TEST_ROUTINES_XSCAL_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -105,6 +97,15 @@ class TestXscal { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXscal(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.x_size, static_cast<T>(0)); diff --git a/test/routines/level1/xswap.hpp b/test/routines/level1/xswap.hpp index d9b84dc4..d778cc23 100644 --- a/test/routines/level1/xswap.hpp +++ b/test/routines/level1/xswap.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSWAP_H_ #define CLBLAST_TEST_ROUTINES_XSWAP_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -111,6 +103,16 @@ class TestXswap { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXswap(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.x_size + args.y_size, static_cast<T>(0)); diff --git a/test/routines/level2/xgbmv.hpp b/test/routines/level2/xgbmv.hpp index 990ef49f..23138c77 100644 --- a/test/routines/level2/xgbmv.hpp +++ b/test/routines/level2/xgbmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XGBMV_H_ #define CLBLAST_TEST_ROUTINES_XGBMV_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -131,6 +123,19 @@ class TestXgbmv { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXgbmv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.a_transpose), + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.y_size, static_cast<T>(0)); diff --git a/test/routines/level2/xgemv.hpp b/test/routines/level2/xgemv.hpp index a007cb62..0ee53b80 100644 --- a/test/routines/level2/xgemv.hpp +++ b/test/routines/level2/xgemv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XGEMV_H_ #define CLBLAST_TEST_ROUTINES_XGEMV_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -131,6 +123,19 @@ class TestXgemv { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXgemv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.a_transpose), + args.m, args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.y_size, static_cast<T>(0)); diff --git a/test/routines/level2/xger.hpp b/test/routines/level2/xger.hpp index 5c131e2d..92a1a2ae 100644 --- a/test/routines/level2/xger.hpp +++ b/test/routines/level2/xger.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XGER_H_ #define CLBLAST_TEST_ROUTINES_XGER_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -125,6 +117,18 @@ class TestXger { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXger(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + args.m, args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.a_size, static_cast<T>(0)); diff --git a/test/routines/level2/xgerc.hpp b/test/routines/level2/xgerc.hpp index e3544424..5d899398 100644 --- a/test/routines/level2/xgerc.hpp +++ b/test/routines/level2/xgerc.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XGERC_H_ #define CLBLAST_TEST_ROUTINES_XGERC_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -125,6 +117,18 @@ class TestXgerc { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXgerc(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + args.m, args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.a_size, static_cast<T>(0)); diff --git a/test/routines/level2/xgeru.hpp b/test/routines/level2/xgeru.hpp index 1d81e292..96dab22e 100644 --- a/test/routines/level2/xgeru.hpp +++ b/test/routines/level2/xgeru.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XGERU_H_ #define CLBLAST_TEST_ROUTINES_XGERU_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -125,6 +117,18 @@ class TestXgeru { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXgeru(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + args.m, args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.a_size, static_cast<T>(0)); diff --git a/test/routines/level2/xhbmv.hpp b/test/routines/level2/xhbmv.hpp index 21194fd6..b6844744 100644 --- a/test/routines/level2/xhbmv.hpp +++ b/test/routines/level2/xhbmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHBMV_H_ #define CLBLAST_TEST_ROUTINES_XHBMV_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -125,6 +117,19 @@ class TestXhbmv { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXhbmv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.y_size, static_cast<T>(0)); diff --git a/test/routines/level2/xhemv.hpp b/test/routines/level2/xhemv.hpp index ffef8ff8..e1f23592 100644 --- a/test/routines/level2/xhemv.hpp +++ b/test/routines/level2/xhemv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHEMV_H_ #define CLBLAST_TEST_ROUTINES_XHEMV_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -125,6 +117,19 @@ class TestXhemv { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXhemv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.y_size, static_cast<T>(0)); diff --git a/test/routines/level2/xher.hpp b/test/routines/level2/xher.hpp index 083bd3fc..1ac1247b 100644 --- a/test/routines/level2/xher.hpp +++ b/test/routines/level2/xher.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHER_H_ #define CLBLAST_TEST_ROUTINES_XHER_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -118,6 +110,18 @@ class TestXher { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<U> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXher(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.a_size, static_cast<T>(0)); diff --git a/test/routines/level2/xher2.hpp b/test/routines/level2/xher2.hpp index 7bd890a5..18ccc1ac 100644 --- a/test/routines/level2/xher2.hpp +++ b/test/routines/level2/xher2.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHER2_H_ #define CLBLAST_TEST_ROUTINES_XHER2_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -125,6 +117,19 @@ class TestXher2 { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXher2(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.a_size, static_cast<T>(0)); diff --git a/test/routines/level2/xhpmv.hpp b/test/routines/level2/xhpmv.hpp index 285dd6d3..ad91fe15 100644 --- a/test/routines/level2/xhpmv.hpp +++ b/test/routines/level2/xhpmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHPMV_H_ #define CLBLAST_TEST_ROUTINES_XHPMV_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -125,6 +117,19 @@ class TestXhpmv { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXhpmv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.y_size, static_cast<T>(0)); diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp index 88bae86b..f9d580cd 100644 --- a/test/routines/level2/xhpr.hpp +++ b/test/routines/level2/xhpr.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHPR_H_ #define CLBLAST_TEST_ROUTINES_XHPR_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -118,6 +110,18 @@ class TestXhpr { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<U> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXhpr(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.ap_size, static_cast<T>(0)); diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp index cd10fa00..f946ba5c 100644 --- a/test/routines/level2/xhpr2.hpp +++ b/test/routines/level2/xhpr2.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHPR2_H_ #define CLBLAST_TEST_ROUTINES_XHPR2_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -125,6 +117,19 @@ class TestXhpr2 { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXhpr2(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.ap_size, static_cast<T>(0)); diff --git a/test/routines/level2/xsbmv.hpp b/test/routines/level2/xsbmv.hpp index 5c70aba5..6481d19b 100644 --- a/test/routines/level2/xsbmv.hpp +++ b/test/routines/level2/xsbmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSBMV_H_ #define CLBLAST_TEST_ROUTINES_XSBMV_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -125,6 +117,19 @@ class TestXsbmv { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXsbmv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.y_size, static_cast<T>(0)); diff --git a/test/routines/level2/xspmv.hpp b/test/routines/level2/xspmv.hpp index 560f5baa..9815dbee 100644 --- a/test/routines/level2/xspmv.hpp +++ b/test/routines/level2/xspmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSPMV_H_ #define CLBLAST_TEST_ROUTINES_XSPMV_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -125,6 +117,19 @@ class TestXspmv { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXspmv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.y_size, static_cast<T>(0)); diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp index 2e12db33..01a50c38 100644 --- a/test/routines/level2/xspr.hpp +++ b/test/routines/level2/xspr.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSPR_H_ #define CLBLAST_TEST_ROUTINES_XSPR_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -118,6 +110,18 @@ class TestXspr { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXspr(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.ap_size, static_cast<T>(0)); diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp index a7e22227..55f8a141 100644 --- a/test/routines/level2/xspr2.hpp +++ b/test/routines/level2/xspr2.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSPR2_H_ #define CLBLAST_TEST_ROUTINES_XSPR2_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -125,6 +117,19 @@ class TestXspr2 { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXspr2(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.ap_size, static_cast<T>(0)); diff --git a/test/routines/level2/xsymv.hpp b/test/routines/level2/xsymv.hpp index d9cf9c1e..aec0dfb0 100644 --- a/test/routines/level2/xsymv.hpp +++ b/test/routines/level2/xsymv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSYMV_H_ #define CLBLAST_TEST_ROUTINES_XSYMV_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -125,6 +117,19 @@ class TestXsymv { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXsymv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.y_size, static_cast<T>(0)); diff --git a/test/routines/level2/xsyr.hpp b/test/routines/level2/xsyr.hpp index b60c3a36..78b686d8 100644 --- a/test/routines/level2/xsyr.hpp +++ b/test/routines/level2/xsyr.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSYR_H_ #define CLBLAST_TEST_ROUTINES_XSYR_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -118,6 +110,18 @@ class TestXsyr { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXsyr(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.a_size, static_cast<T>(0)); diff --git a/test/routines/level2/xsyr2.hpp b/test/routines/level2/xsyr2.hpp index dd10a3d0..38aa4f43 100644 --- a/test/routines/level2/xsyr2.hpp +++ b/test/routines/level2/xsyr2.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSYR2_H_ #define CLBLAST_TEST_ROUTINES_XSYR2_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -125,6 +117,19 @@ class TestXsyr2 { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXsyr2(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + args.n, args.alpha, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.a_size, static_cast<T>(0)); diff --git a/test/routines/level2/xtbmv.hpp b/test/routines/level2/xtbmv.hpp index 7eb8ce9e..8c7aa381 100644 --- a/test/routines/level2/xtbmv.hpp +++ b/test/routines/level2/xtbmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XTBMV_H_ #define CLBLAST_TEST_ROUTINES_XTBMV_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -121,6 +113,20 @@ class TestXtbmv { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXtbmv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.diagonal), + args.n, args.kl, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.x_size, static_cast<T>(0)); diff --git a/test/routines/level2/xtpmv.hpp b/test/routines/level2/xtpmv.hpp index 7f4842f0..3afab978 100644 --- a/test/routines/level2/xtpmv.hpp +++ b/test/routines/level2/xtpmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XTPMV_H_ #define CLBLAST_TEST_ROUTINES_XTPMV_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -121,6 +113,20 @@ class TestXtpmv { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXtpmv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.diagonal), + args.n, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.x_size, static_cast<T>(0)); diff --git a/test/routines/level2/xtrmv.hpp b/test/routines/level2/xtrmv.hpp index cb7527ed..2b71f151 100644 --- a/test/routines/level2/xtrmv.hpp +++ b/test/routines/level2/xtrmv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XTRMV_H_ #define CLBLAST_TEST_ROUTINES_XTRMV_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -121,6 +113,20 @@ class TestXtrmv { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXtrmv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.diagonal), + args.n, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.x_size, static_cast<T>(0)); diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp index 63d34758..85b50e85 100644 --- a/test/routines/level2/xtrsv.hpp +++ b/test/routines/level2/xtrsv.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XTRSV_H_ #define CLBLAST_TEST_ROUTINES_XTRSV_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -136,6 +128,20 @@ class TestXtrsv { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXtrsv(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.diagonal), + args.n, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.x_size, static_cast<T>(0)); diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp index a33cbfec..7e0ead6d 100644 --- a/test/routines/level3/xgemm.hpp +++ b/test/routines/level3/xgemm.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XGEMM_H_ #define CLBLAST_TEST_ROUTINES_XGEMM_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -135,6 +127,20 @@ class TestXgemm { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXgemm(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.b_transpose), + args.m, args.n, args.k, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.c_size, static_cast<T>(0)); diff --git a/test/routines/level3/xhemm.hpp b/test/routines/level3/xhemm.hpp index 74029c7e..a89617b5 100644 --- a/test/routines/level3/xhemm.hpp +++ b/test/routines/level3/xhemm.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHEMM_H_ #define CLBLAST_TEST_ROUTINES_XHEMM_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -135,6 +127,20 @@ class TestXhemm { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXhemm(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.side), + convertToCUBLAS(args.triangle), + args.m, args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.c_size, static_cast<T>(0)); diff --git a/test/routines/level3/xher2k.hpp b/test/routines/level3/xher2k.hpp index ea13bbc1..55e6d894 100644 --- a/test/routines/level3/xher2k.hpp +++ b/test/routines/level3/xher2k.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHER2K_H_ #define CLBLAST_TEST_ROUTINES_XHER2K_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -136,6 +128,21 @@ class TestXher2k { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<U> &args, BuffersCUDA<T> &buffers, Queue &) { + auto alpha2 = T{args.alpha, args.alpha}; + auto status = cublasXher2k(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + args.n, args.k, alpha2, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.c_size, static_cast<T>(0)); diff --git a/test/routines/level3/xherk.hpp b/test/routines/level3/xherk.hpp index b1ce83e0..3e1e7e02 100644 --- a/test/routines/level3/xherk.hpp +++ b/test/routines/level3/xherk.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XHERK_H_ #define CLBLAST_TEST_ROUTINES_XHERK_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -123,6 +115,19 @@ class TestXherk { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<U> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXherk(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.c_size, static_cast<T>(0)); diff --git a/test/routines/level3/xsymm.hpp b/test/routines/level3/xsymm.hpp index 6ab644b8..5d840d40 100644 --- a/test/routines/level3/xsymm.hpp +++ b/test/routines/level3/xsymm.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSYMM_H_ #define CLBLAST_TEST_ROUTINES_XSYMM_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -135,6 +127,20 @@ class TestXsymm { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXsymm(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.side), + convertToCUBLAS(args.triangle), + args.m, args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.c_size, static_cast<T>(0)); diff --git a/test/routines/level3/xsyr2k.hpp b/test/routines/level3/xsyr2k.hpp index 1400c4e2..4a4a2f10 100644 --- a/test/routines/level3/xsyr2k.hpp +++ b/test/routines/level3/xsyr2k.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSYR2K_H_ #define CLBLAST_TEST_ROUTINES_XSYR2K_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -133,6 +125,20 @@ class TestXsyr2k { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXsyr2k(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.c_size, static_cast<T>(0)); diff --git a/test/routines/level3/xsyrk.hpp b/test/routines/level3/xsyrk.hpp index 2df8d6b0..90e46727 100644 --- a/test/routines/level3/xsyrk.hpp +++ b/test/routines/level3/xsyrk.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XSYRK_H_ #define CLBLAST_TEST_ROUTINES_XSYRK_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -123,6 +115,19 @@ class TestXsyrk { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXsyrk(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.c_size, static_cast<T>(0)); diff --git a/test/routines/level3/xtrmm.hpp b/test/routines/level3/xtrmm.hpp index 84adc6e0..acc00e01 100644 --- a/test/routines/level3/xtrmm.hpp +++ b/test/routines/level3/xtrmm.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XTRMM_H_ #define CLBLAST_TEST_ROUTINES_XTRMM_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -127,6 +119,21 @@ class TestXtrmm { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXtrmm(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.side), + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.diagonal), + args.m, args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.b_size, static_cast<T>(0)); diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp index de5b307d..d63c9d79 100644 --- a/test/routines/level3/xtrsm.hpp +++ b/test/routines/level3/xtrsm.hpp @@ -16,18 +16,9 @@ #ifndef CLBLAST_TEST_ROUTINES_XTRSM_H_ #define CLBLAST_TEST_ROUTINES_XTRSM_H_ -#include <vector> -#include <string> - +#include "test/routines/common.hpp" #include "test/routines/level3/xtrsm_data.hpp" -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif - namespace clblast { // ================================================================================================= @@ -139,6 +130,21 @@ class TestXtrsm { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + auto status = cublasXtrsm(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.side), + convertToCUBLAS(args.triangle), + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.diagonal), + args.m, args.n, args.alpha, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld); + if (status == CUBLAS_STATUS_SUCCESS) { return StatusCode::kSuccess; } else { return StatusCode::kUnknownError; } + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.b_size, static_cast<T>(0)); diff --git a/test/routines/levelx/xaxpybatched.hpp b/test/routines/levelx/xaxpybatched.hpp index 05141bbb..5385e86e 100644 --- a/test/routines/levelx/xaxpybatched.hpp +++ b/test/routines/levelx/xaxpybatched.hpp @@ -16,17 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_ #define CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_ -#include <vector> -#include <string> - -#include "utilities/utilities.hpp" - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -135,6 +125,19 @@ class TestXaxpyBatched { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + auto status = cublasXaxpy(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.n, args.alphas[batch], + buffers.x_vec, args.x_offsets[batch], args.x_inc, + buffers.y_vec, args.y_offsets[batch], args.y_inc); + if (status != CUBLAS_STATUS_SUCCESS) { return StatusCode::kUnknownError; } + } + return StatusCode::kSuccess; + } + #endif + // Describes how to download the results of the computation static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.y_size, static_cast<T>(0)); diff --git a/test/routines/levelx/xgemmbatched.hpp b/test/routines/levelx/xgemmbatched.hpp index ab5f20c5..ebfd8b19 100644 --- a/test/routines/levelx/xgemmbatched.hpp +++ b/test/routines/levelx/xgemmbatched.hpp @@ -16,15 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XGEMMBATCHED_H_ #define CLBLAST_TEST_ROUTINES_XGEMMBATCHED_H_ -#include <vector> -#include <string> - -#ifdef CLBLAST_REF_CLBLAS - #include "test/wrapper_clblas.hpp" -#endif -#ifdef CLBLAST_REF_CBLAS - #include "test/wrapper_cblas.hpp" -#endif +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -168,6 +160,23 @@ class TestXgemmBatched { } #endif + // Describes how to run the cuBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CUBLAS + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + auto status = cublasXgemm(reinterpret_cast<cublasHandle_t>(args.cublas_handle), args.layout, + convertToCUBLAS(args.a_transpose), + convertToCUBLAS(args.b_transpose), + args.m, args.n, args.k, args.alphas[batch], + buffers.a_mat, args.a_offsets[batch], args.a_ld, + buffers.b_mat, args.b_offsets[batch], args.b_ld, args.betas[batch], + buffers.c_mat, args.c_offsets[batch], args.c_ld); + if (status != CUBLAS_STATUS_SUCCESS) { return StatusCode::kUnknownError; } + } + return StatusCode::kSuccess; + } + #endif + // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { std::vector<T> result(args.c_size, static_cast<T>(0)); diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp index ffb484b0..cc02a88b 100644 --- a/test/routines/levelx/xinvert.hpp +++ b/test/routines/levelx/xinvert.hpp @@ -16,10 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XINVERT_H_ #define CLBLAST_TEST_ROUTINES_XINVERT_H_ -#include <vector> -#include <string> - -#include "utilities/utilities.hpp" +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -192,6 +189,9 @@ class TestXinvert { static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue&) { return RunReference(args, buffers_host); } + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + return StatusCode::kUnknownError; + } // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp index d5973b4c..bbf6006c 100644 --- a/test/routines/levelx/xomatcopy.hpp +++ b/test/routines/levelx/xomatcopy.hpp @@ -16,8 +16,7 @@ #ifndef CLBLAST_TEST_ROUTINES_XOMATCOPY_H_ #define CLBLAST_TEST_ROUTINES_XOMATCOPY_H_ -#include <vector> -#include <string> +#include "test/routines/common.hpp" namespace clblast { // ================================================================================================= @@ -151,6 +150,9 @@ class TestXomatcopy { static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue&) { return RunReference(args, buffers_host); } + static StatusCode RunReference3(const Arguments<T> &args, BuffersCUDA<T> &buffers, Queue &) { + return StatusCode::kUnknownError; + } // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/wrapper_cblas.hpp b/test/wrapper_cblas.hpp index dd610a6c..070d44b5 100644 --- a/test/wrapper_cblas.hpp +++ b/test/wrapper_cblas.hpp @@ -94,7 +94,7 @@ void cblasXrot(const size_t n, std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc, const float cos, const float sin) { - cblas_srot(n, + cblas_srot(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), cos, @@ -105,7 +105,7 @@ void cblasXrot(const size_t n, std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc, const double cos, const double sin) { - cblas_drot(n, + cblas_drot(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), cos, @@ -117,7 +117,7 @@ void cblasXrotm(const size_t n, std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc, std::vector<float>& sparam_buffer, const size_t sparam_offset) { - cblas_srotm(n, + cblas_srotm(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), &sparam_buffer[sparam_offset]); @@ -126,7 +126,7 @@ void cblasXrotm(const size_t n, std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc, std::vector<double>& sparam_buffer, const size_t sparam_offset) { - cblas_drotm(n, + cblas_drotm(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), &sparam_buffer[sparam_offset]); @@ -136,28 +136,28 @@ void cblasXrotm(const size_t n, void cblasXswap(const size_t n, std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_sswap(n, + cblas_sswap(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc)); } void cblasXswap(const size_t n, std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_dswap(n, + cblas_dswap(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc)); } void cblasXswap(const size_t n, std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_cswap(n, + cblas_cswap(static_cast<int>(n), reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); } void cblasXswap(const size_t n, std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_zswap(n, + cblas_zswap(static_cast<int>(n), reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); } @@ -177,14 +177,14 @@ void cblasXswap(const size_t n, void cblasXscal(const size_t n, const float alpha, std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) { - cblas_sscal(n, + cblas_sscal(static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc)); } void cblasXscal(const size_t n, const double alpha, std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) { - cblas_dscal(n, + cblas_dscal(static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc)); } @@ -192,7 +192,7 @@ void cblasXscal(const size_t n, const float2 alpha, std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) { const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; - cblas_cscal(n, + cblas_cscal(static_cast<int>(n), alpha_array.data(), reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -200,7 +200,7 @@ void cblasXscal(const size_t n, const double2 alpha, std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) { const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; - cblas_zscal(n, + cblas_zscal(static_cast<int>(n), alpha_array.data(), reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -218,28 +218,28 @@ void cblasXscal(const size_t n, void cblasXcopy(const size_t n, const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_scopy(n, + cblas_scopy(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc)); } void cblasXcopy(const size_t n, const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_dcopy(n, + cblas_dcopy(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc)); } void cblasXcopy(const size_t n, const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_ccopy(n, + cblas_ccopy(static_cast<int>(n), reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); } void cblasXcopy(const size_t n, const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_zcopy(n, + cblas_zcopy(static_cast<int>(n), reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); } @@ -259,7 +259,7 @@ void cblasXaxpy(const size_t n, const float alpha, const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_saxpy(n, + cblas_saxpy(static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc)); @@ -268,7 +268,7 @@ void cblasXaxpy(const size_t n, const double alpha, const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_daxpy(n, + cblas_daxpy(static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc)); @@ -278,7 +278,7 @@ void cblasXaxpy(const size_t n, const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) { const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; - cblas_caxpy(n, + cblas_caxpy(static_cast<int>(n), alpha_array.data(), reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); @@ -288,7 +288,7 @@ void cblasXaxpy(const size_t n, const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) { const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; - cblas_zaxpy(n, + cblas_zaxpy(static_cast<int>(n), alpha_array.data(), reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); @@ -311,7 +311,7 @@ void cblasXdot(const size_t n, std::vector<float>& dot_buffer, const size_t dot_offset, const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) { - dot_buffer[dot_offset] = cblas_sdot(n, + dot_buffer[dot_offset] = cblas_sdot(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc)); } @@ -319,7 +319,7 @@ void cblasXdot(const size_t n, std::vector<double>& dot_buffer, const size_t dot_offset, const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) { - dot_buffer[dot_offset] = cblas_ddot(n, + dot_buffer[dot_offset] = cblas_ddot(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc)); } @@ -342,7 +342,7 @@ void cblasXdotu(const size_t n, std::vector<float2>& dot_buffer, const size_t dot_offset, const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_cdotu_sub(n, + cblas_cdotu_sub(static_cast<int>(n), reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc), reinterpret_cast<return_pointer_float>(&dot_buffer[dot_offset])); @@ -351,7 +351,7 @@ void cblasXdotu(const size_t n, std::vector<double2>& dot_buffer, const size_t dot_offset, const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_zdotu_sub(n, + cblas_zdotu_sub(static_cast<int>(n), reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc), reinterpret_cast<return_pointer_double>(&dot_buffer[dot_offset])); @@ -362,7 +362,7 @@ void cblasXdotc(const size_t n, std::vector<float2>& dot_buffer, const size_t dot_offset, const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_cdotc_sub(n, + cblas_cdotc_sub(static_cast<int>(n), reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc), reinterpret_cast<return_pointer_float>(&dot_buffer[dot_offset])); @@ -371,7 +371,7 @@ void cblasXdotc(const size_t n, std::vector<double2>& dot_buffer, const size_t dot_offset, const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc, const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) { - cblas_zdotc_sub(n, + cblas_zdotc_sub(static_cast<int>(n), reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc), reinterpret_cast<return_pointer_double>(&dot_buffer[dot_offset])); @@ -381,25 +381,25 @@ void cblasXdotc(const size_t n, void cblasXnrm2(const size_t n, std::vector<float>& nrm2_buffer, const size_t nrm2_offset, const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) { - nrm2_buffer[nrm2_offset] = cblas_snrm2(n, + nrm2_buffer[nrm2_offset] = cblas_snrm2(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc)); } void cblasXnrm2(const size_t n, std::vector<double>& nrm2_buffer, const size_t nrm2_offset, const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) { - nrm2_buffer[nrm2_offset] = cblas_dnrm2(n, + nrm2_buffer[nrm2_offset] = cblas_dnrm2(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc)); } void cblasXnrm2(const size_t n, std::vector<float2>& nrm2_buffer, const size_t nrm2_offset, const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) { - nrm2_buffer[nrm2_offset].real(cblas_scnrm2(n, + nrm2_buffer[nrm2_offset].real(cblas_scnrm2(static_cast<int>(n), reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc))); } void cblasXnrm2(const size_t n, std::vector<double2>& nrm2_buffer, const size_t nrm2_offset, const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) { - nrm2_buffer[nrm2_offset].real(cblas_dznrm2(n, + nrm2_buffer[nrm2_offset].real(cblas_dznrm2(static_cast<int>(n), reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc))); } void cblasXnrm2(const size_t n, @@ -417,25 +417,25 @@ void cblasXnrm2(const size_t n, void cblasXasum(const size_t n, std::vector<float>& asum_buffer, const size_t asum_offset, const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) { - asum_buffer[asum_offset] = cblas_sasum(n, + asum_buffer[asum_offset] = cblas_sasum(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc)); } void cblasXasum(const size_t n, std::vector<double>& asum_buffer, const size_t asum_offset, const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) { - asum_buffer[asum_offset] = cblas_dasum(n, + asum_buffer[asum_offset] = cblas_dasum(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc)); } void cblasXasum(const size_t n, std::vector<float2>& asum_buffer, const size_t asum_offset, const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) { - asum_buffer[asum_offset].real(cblas_scasum(n, + asum_buffer[asum_offset].real(cblas_scasum(static_cast<int>(n), reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc))); } void cblasXasum(const size_t n, std::vector<double2>& asum_buffer, const size_t asum_offset, const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) { - asum_buffer[asum_offset].real(cblas_dzasum(n, + asum_buffer[asum_offset].real(cblas_dzasum(static_cast<int>(n), reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc))); } void cblasXasum(const size_t n, @@ -453,25 +453,25 @@ void cblasXasum(const size_t n, void cblasXamax(const size_t n, std::vector<float>& imax_buffer, const size_t imax_offset, const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) { - ((int*)&imax_buffer[0])[imax_offset] = cblas_isamax(n, + ((int*)&imax_buffer[0])[imax_offset] = cblas_isamax(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc)); } void cblasXamax(const size_t n, std::vector<double>& imax_buffer, const size_t imax_offset, const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) { - ((int*)&imax_buffer[0])[imax_offset] = cblas_idamax(n, + ((int*)&imax_buffer[0])[imax_offset] = cblas_idamax(static_cast<int>(n), &x_buffer[x_offset], static_cast<int>(x_inc)); } void cblasXamax(const size_t n, std::vector<float2>& imax_buffer, const size_t imax_offset, const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) { - ((int*)&imax_buffer[0])[imax_offset] = cblas_icamax(n, + ((int*)&imax_buffer[0])[imax_offset] = cblas_icamax(static_cast<int>(n), reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } void cblasXamax(const size_t n, std::vector<double2>& imax_buffer, const size_t imax_offset, const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) { - ((int*)&imax_buffer[0])[imax_offset] = cblas_izamax(n, + ((int*)&imax_buffer[0])[imax_offset] = cblas_izamax(static_cast<int>(n), reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } void cblasXamax(const size_t n, @@ -498,7 +498,7 @@ void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const float beta, std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_sgemv(layout, a_transpose, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc), @@ -513,7 +513,7 @@ void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const double beta, std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_dgemv(layout, a_transpose, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc), @@ -530,7 +530,7 @@ void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<float>{beta.real(), beta.imag()}; cblas_cgemv(layout, a_transpose, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), @@ -547,7 +547,7 @@ void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<double>{beta.real(), beta.imag()}; cblas_zgemv(layout, a_transpose, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), @@ -583,7 +583,7 @@ void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const float beta, std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_sgbmv(layout, a_transpose, - m, n, kl, ku, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(kl), static_cast<int>(ku), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc), @@ -598,7 +598,7 @@ void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const double beta, std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_dgbmv(layout, a_transpose, - m, n, kl, ku, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(kl), static_cast<int>(ku), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc), @@ -615,7 +615,7 @@ void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<float>{beta.real(), beta.imag()}; cblas_cgbmv(layout, a_transpose, - m, n, kl, ku, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(kl), static_cast<int>(ku), alpha_array.data(), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), @@ -632,7 +632,7 @@ void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<double>{beta.real(), beta.imag()}; cblas_zgbmv(layout, a_transpose, - m, n, kl, ku, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(kl), static_cast<int>(ku), alpha_array.data(), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), @@ -670,7 +670,7 @@ void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<float>{beta.real(), beta.imag()}; cblas_chemv(layout, triangle, - n, + static_cast<int>(n), alpha_array.data(), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), @@ -687,7 +687,7 @@ void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<double>{beta.real(), beta.imag()}; cblas_zhemv(layout, triangle, - n, + static_cast<int>(n), alpha_array.data(), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), @@ -706,7 +706,7 @@ void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<float>{beta.real(), beta.imag()}; cblas_chbmv(layout, triangle, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha_array.data(), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), @@ -723,7 +723,7 @@ void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<double>{beta.real(), beta.imag()}; cblas_zhbmv(layout, triangle, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha_array.data(), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), @@ -742,7 +742,7 @@ void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<float>{beta.real(), beta.imag()}; cblas_chpmv(layout, triangle, - n, + static_cast<int>(n), alpha_array.data(), reinterpret_cast<const float*>(&ap_buffer[ap_offset]), reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), @@ -759,7 +759,7 @@ void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<double>{beta.real(), beta.imag()}; cblas_zhpmv(layout, triangle, - n, + static_cast<int>(n), alpha_array.data(), reinterpret_cast<const double*>(&ap_buffer[ap_offset]), reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), @@ -776,7 +776,7 @@ void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const float beta, std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_ssymv(layout, triangle, - n, + static_cast<int>(n), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc), @@ -791,7 +791,7 @@ void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const double beta, std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_dsymv(layout, triangle, - n, + static_cast<int>(n), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc), @@ -827,7 +827,7 @@ void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const float beta, std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_ssbmv(layout, triangle, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc), @@ -842,7 +842,7 @@ void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const double beta, std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_dsbmv(layout, triangle, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha, &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc), @@ -878,7 +878,7 @@ void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const float beta, std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_sspmv(layout, triangle, - n, + static_cast<int>(n), alpha, &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast<int>(x_inc), @@ -893,7 +893,7 @@ void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const double beta, std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) { cblas_dspmv(layout, triangle, - n, + static_cast<int>(n), alpha, &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast<int>(x_inc), @@ -926,7 +926,7 @@ void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_strmv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc)); } @@ -935,7 +935,7 @@ void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_dtrmv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc)); } @@ -944,7 +944,7 @@ void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ctrmv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -953,7 +953,7 @@ void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ztrmv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -976,7 +976,7 @@ void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_stbmv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast<int>(n), static_cast<int>(k), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc)); } @@ -985,7 +985,7 @@ void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_dtbmv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast<int>(n), static_cast<int>(k), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc)); } @@ -994,7 +994,7 @@ void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ctbmv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast<int>(n), static_cast<int>(k), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -1003,7 +1003,7 @@ void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ztbmv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast<int>(n), static_cast<int>(k), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -1026,7 +1026,7 @@ void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<float>& ap_buffer, const size_t ap_offset, std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_stpmv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast<int>(x_inc)); } @@ -1035,7 +1035,7 @@ void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<double>& ap_buffer, const size_t ap_offset, std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_dtpmv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast<int>(x_inc)); } @@ -1044,7 +1044,7 @@ void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<float2>& ap_buffer, const size_t ap_offset, std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ctpmv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), reinterpret_cast<const float*>(&ap_buffer[ap_offset]), reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -1053,7 +1053,7 @@ void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<double2>& ap_buffer, const size_t ap_offset, std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ztpmv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), reinterpret_cast<const double*>(&ap_buffer[ap_offset]), reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -1076,7 +1076,7 @@ void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_strsv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc)); } @@ -1085,7 +1085,7 @@ void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_dtrsv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc)); } @@ -1094,7 +1094,7 @@ void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ctrsv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -1103,7 +1103,7 @@ void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ztrsv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -1114,7 +1114,7 @@ void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_stbsv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast<int>(n), static_cast<int>(k), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc)); } @@ -1123,7 +1123,7 @@ void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_dtbsv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast<int>(n), static_cast<int>(k), &a_buffer[a_offset], a_ld, &x_buffer[x_offset], static_cast<int>(x_inc)); } @@ -1132,7 +1132,7 @@ void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ctbsv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast<int>(n), static_cast<int>(k), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -1141,7 +1141,7 @@ void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ztbsv(layout, triangle, a_transpose, diagonal, - n, k, + static_cast<int>(n), static_cast<int>(k), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -1152,7 +1152,7 @@ void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<float>& ap_buffer, const size_t ap_offset, std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_stpsv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast<int>(x_inc)); } @@ -1161,7 +1161,7 @@ void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<double>& ap_buffer, const size_t ap_offset, std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_dtpsv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), &ap_buffer[ap_offset], &x_buffer[x_offset], static_cast<int>(x_inc)); } @@ -1170,7 +1170,7 @@ void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<float2>& ap_buffer, const size_t ap_offset, std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ctpsv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), reinterpret_cast<const float*>(&ap_buffer[ap_offset]), reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -1179,7 +1179,7 @@ void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const std::vector<double2>& ap_buffer, const size_t ap_offset, std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) { cblas_ztpsv(layout, triangle, a_transpose, diagonal, - n, + static_cast<int>(n), reinterpret_cast<const double*>(&ap_buffer[ap_offset]), reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } @@ -1192,7 +1192,7 @@ void cblasXger(const CBLAS_ORDER layout, const std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc, std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_sger(layout, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), @@ -1205,7 +1205,7 @@ void cblasXger(const CBLAS_ORDER layout, const std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc, std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_dger(layout, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), @@ -1238,7 +1238,7 @@ void cblasXgeru(const CBLAS_ORDER layout, std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) { const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; cblas_cgeru(layout, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc), @@ -1252,7 +1252,7 @@ void cblasXgeru(const CBLAS_ORDER layout, std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) { const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; cblas_zgeru(layout, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc), @@ -1268,7 +1268,7 @@ void cblasXgerc(const CBLAS_ORDER layout, std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) { const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; cblas_cgerc(layout, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc), @@ -1282,7 +1282,7 @@ void cblasXgerc(const CBLAS_ORDER layout, std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) { const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; cblas_zgerc(layout, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc), @@ -1296,7 +1296,7 @@ void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_cher(layout, triangle, - n, + static_cast<int>(n), alpha, reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<float*>(&a_buffer[a_offset]), a_ld); @@ -1307,7 +1307,7 @@ void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_zher(layout, triangle, - n, + static_cast<int>(n), alpha, reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<double*>(&a_buffer[a_offset]), a_ld); @@ -1320,7 +1320,7 @@ void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<float2>& ap_buffer, const size_t ap_offset) { cblas_chpr(layout, triangle, - n, + static_cast<int>(n), alpha, reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<float*>(&ap_buffer[ap_offset])); @@ -1331,7 +1331,7 @@ void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<double2>& ap_buffer, const size_t ap_offset) { cblas_zhpr(layout, triangle, - n, + static_cast<int>(n), alpha, reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<double*>(&ap_buffer[ap_offset])); @@ -1346,7 +1346,7 @@ void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) { const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; cblas_cher2(layout, triangle, - n, + static_cast<int>(n), alpha_array.data(), reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc), @@ -1360,7 +1360,7 @@ void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) { const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; cblas_zher2(layout, triangle, - n, + static_cast<int>(n), alpha_array.data(), reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc), @@ -1376,7 +1376,7 @@ void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, std::vector<float2>& ap_buffer, const size_t ap_offset) { const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; cblas_chpr2(layout, triangle, - n, + static_cast<int>(n), alpha_array.data(), reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc), @@ -1390,7 +1390,7 @@ void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, std::vector<double2>& ap_buffer, const size_t ap_offset) { const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; cblas_zhpr2(layout, triangle, - n, + static_cast<int>(n), alpha_array.data(), reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc), @@ -1404,7 +1404,7 @@ void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_ssyr(layout, triangle, - n, + static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &a_buffer[a_offset], a_ld); @@ -1415,7 +1415,7 @@ void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_dsyr(layout, triangle, - n, + static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &a_buffer[a_offset], a_ld); @@ -1442,7 +1442,7 @@ void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<float>& ap_buffer, const size_t ap_offset) { cblas_sspr(layout, triangle, - n, + static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &ap_buffer[ap_offset]); @@ -1453,7 +1453,7 @@ void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc, std::vector<double>& ap_buffer, const size_t ap_offset) { cblas_dspr(layout, triangle, - n, + static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &ap_buffer[ap_offset]); @@ -1481,7 +1481,7 @@ void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc, std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_ssyr2(layout, triangle, - n, + static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), @@ -1494,7 +1494,7 @@ void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc, std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld) { cblas_dsyr2(layout, triangle, - n, + static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), @@ -1526,7 +1526,7 @@ void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc, std::vector<float>& ap_buffer, const size_t ap_offset) { cblas_sspr2(layout, triangle, - n, + static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), @@ -1539,7 +1539,7 @@ void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc, std::vector<double>& ap_buffer, const size_t ap_offset) { cblas_dspr2(layout, triangle, - n, + static_cast<int>(n), alpha, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc), @@ -1576,7 +1576,7 @@ void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, con const float beta, std::vector<float>& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_sgemm(layout, a_transpose, b_transpose, - m, n, k, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, @@ -1591,7 +1591,7 @@ void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, con const double beta, std::vector<double>& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_dgemm(layout, a_transpose, b_transpose, - m, n, k, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, @@ -1608,7 +1608,7 @@ void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, con const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<float>{beta.real(), beta.imag()}; cblas_cgemm(layout, a_transpose, b_transpose, - m, n, k, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), alpha_array.data(), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const float*>(&b_buffer[b_offset]), b_ld, @@ -1625,7 +1625,7 @@ void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, con const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<double>{beta.real(), beta.imag()}; cblas_zgemm(layout, a_transpose, b_transpose, - m, n, k, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), alpha_array.data(), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const double*>(&b_buffer[b_offset]), b_ld, @@ -1661,7 +1661,7 @@ void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const float beta, std::vector<float>& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_ssymm(layout, side, triangle, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, @@ -1676,7 +1676,7 @@ void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const double beta, std::vector<double>& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_dsymm(layout, side, triangle, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, @@ -1693,7 +1693,7 @@ void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<float>{beta.real(), beta.imag()}; cblas_csymm(layout, side, triangle, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const float*>(&b_buffer[b_offset]), b_ld, @@ -1710,7 +1710,7 @@ void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<double>{beta.real(), beta.imag()}; cblas_zsymm(layout, side, triangle, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const double*>(&b_buffer[b_offset]), b_ld, @@ -1748,7 +1748,7 @@ void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<float>{beta.real(), beta.imag()}; cblas_chemm(layout, side, triangle, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const float*>(&b_buffer[b_offset]), b_ld, @@ -1765,7 +1765,7 @@ void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<double>{beta.real(), beta.imag()}; cblas_zhemm(layout, side, triangle, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const double*>(&b_buffer[b_offset]), b_ld, @@ -1781,7 +1781,7 @@ void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const float beta, std::vector<float>& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_ssyrk(layout, triangle, a_transpose, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha, &a_buffer[a_offset], a_ld, beta, @@ -1794,7 +1794,7 @@ void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const double beta, std::vector<double>& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_dsyrk(layout, triangle, a_transpose, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha, &a_buffer[a_offset], a_ld, beta, @@ -1809,7 +1809,7 @@ void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<float>{beta.real(), beta.imag()}; cblas_csyrk(layout, triangle, a_transpose, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha_array.data(), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, beta_array.data(), @@ -1824,7 +1824,7 @@ void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<double>{beta.real(), beta.imag()}; cblas_zsyrk(layout, triangle, a_transpose, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha_array.data(), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, beta_array.data(), @@ -1855,7 +1855,7 @@ void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const float beta, std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_cherk(layout, triangle, a_transpose, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha, reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, beta, @@ -1868,7 +1868,7 @@ void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS const double beta, std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_zherk(layout, triangle, a_transpose, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha, reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, beta, @@ -1884,7 +1884,7 @@ void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA const float beta, std::vector<float>& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_ssyr2k(layout, triangle, ab_transpose, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, @@ -1899,7 +1899,7 @@ void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA const double beta, std::vector<double>& c_buffer, const size_t c_offset, const size_t c_ld) { cblas_dsyr2k(layout, triangle, ab_transpose, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld, @@ -1916,7 +1916,7 @@ void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<float>{beta.real(), beta.imag()}; cblas_csyr2k(layout, triangle, ab_transpose, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha_array.data(), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const float*>(&b_buffer[b_offset]), b_ld, @@ -1933,7 +1933,7 @@ void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; const auto beta_array = std::vector<double>{beta.real(), beta.imag()}; cblas_zsyr2k(layout, triangle, ab_transpose, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha_array.data(), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const double*>(&b_buffer[b_offset]), b_ld, @@ -1970,7 +1970,7 @@ void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) { const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; cblas_cher2k(layout, triangle, ab_transpose, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha_array.data(), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const float*>(&b_buffer[b_offset]), b_ld, @@ -1986,7 +1986,7 @@ void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) { const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; cblas_zher2k(layout, triangle, ab_transpose, - n, k, + static_cast<int>(n), static_cast<int>(k), alpha_array.data(), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<const double*>(&b_buffer[b_offset]), b_ld, @@ -2001,7 +2001,7 @@ void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<float>& b_buffer, const size_t b_offset, const size_t b_ld) { cblas_strmm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld); @@ -2012,7 +2012,7 @@ void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<double>& b_buffer, const size_t b_offset, const size_t b_ld) { cblas_dtrmm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld); @@ -2024,7 +2024,7 @@ void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld) { const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; cblas_ctrmm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<float*>(&b_buffer[b_offset]), b_ld); @@ -2036,7 +2036,7 @@ void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld) { const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; cblas_ztrmm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<double*>(&b_buffer[b_offset]), b_ld); @@ -2063,7 +2063,7 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<float>& b_buffer, const size_t b_offset, const size_t b_ld) { cblas_strsm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld); @@ -2074,7 +2074,7 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld, std::vector<double>& b_buffer, const size_t b_offset, const size_t b_ld) { cblas_dtrsm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha, &a_buffer[a_offset], a_ld, &b_buffer[b_offset], b_ld); @@ -2086,7 +2086,7 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld) { const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()}; cblas_ctrsm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<float*>(&b_buffer[b_offset]), b_ld); @@ -2098,7 +2098,7 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld) { const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()}; cblas_ztrsm(layout, side, triangle, a_transpose, diagonal, - m, n, + static_cast<int>(m), static_cast<int>(n), alpha_array.data(), reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<double*>(&b_buffer[b_offset]), b_ld); diff --git a/test/wrapper_cublas.hpp b/test/wrapper_cublas.hpp new file mode 100644 index 00000000..35b1b9c6 --- /dev/null +++ b/test/wrapper_cublas.hpp @@ -0,0 +1,2548 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a wrapper around the cuBLAS library, such that its routines can be called +// in a similar way as the CLBlast routines: using alpha and beta to determine the precision. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_WRAPPER_CUBLAS_H_ +#define CLBLAST_TEST_WRAPPER_CUBLAS_H_ + +#include <cuda_runtime.h> +#include <cublas_v2.h> + +#include "utilities/utilities.hpp" + +namespace clblast { + +// Conversions from CLBlast types +cublasOperation_t convertToCUBLAS(const Transpose v) { return (v == Transpose::kNo) ? CUBLAS_OP_N : (v == Transpose::kYes) ? CUBLAS_OP_T : CUBLAS_OP_C; } +cublasFillMode_t convertToCUBLAS(const Triangle v) { return (v == Triangle::kUpper) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; } +cublasDiagType_t convertToCUBLAS(const Diagonal v) { return (v == Diagonal::kUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; } +cublasSideMode_t convertToCUBLAS(const Side v) { return (v == Side::kLeft) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; } + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Forwards the cuBLAS calls for SROTG/DROTG +template <typename T> +cublasStatus_t cublasXrotg(cublasHandle_t handle, T* sa_buffer, const size_t sa_offset, + T* sb_buffer, const size_t sb_offset, + T* sc_buffer, const size_t sc_offset, + T* ss_buffer, const size_t ss_offset); +template <> +cublasStatus_t cublasXrotg<float>(cublasHandle_t handle, float* sa_buffer, const size_t sa_offset, + float* sb_buffer, const size_t sb_offset, + float* sc_buffer, const size_t sc_offset, + float* ss_buffer, const size_t ss_offset) { + auto status = cublasSrotg(handle, &sa_buffer[sa_offset], + &sb_buffer[sb_offset], + &sc_buffer[sc_offset], + &ss_buffer[ss_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXrotg<double>(cublasHandle_t handle, double* sa_buffer, const size_t sa_offset, + double* sb_buffer, const size_t sb_offset, + double* sc_buffer, const size_t sc_offset, + double* ss_buffer, const size_t ss_offset) { + auto status = cublasDrotg(handle, &sa_buffer[sa_offset], + &sb_buffer[sb_offset], + &sc_buffer[sc_offset], + &ss_buffer[ss_offset]); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SROTMG/DROTMG +template <typename T> +cublasStatus_t cublasXrotmg(cublasHandle_t handle, T* sd1_buffer, const size_t sd1_offset, + T* sd2_buffer, const size_t sd2_offset, + T* sx1_buffer, const size_t sx1_offset, + const T* sy1_buffer, const size_t sy1_offset, + T* sparam_buffer, const size_t sparam_offset); +template <> +cublasStatus_t cublasXrotmg<float>(cublasHandle_t handle, float* sd1_buffer, const size_t sd1_offset, + float* sd2_buffer, const size_t sd2_offset, + float* sx1_buffer, const size_t sx1_offset, + const float* sy1_buffer, const size_t sy1_offset, + float* sparam_buffer, const size_t sparam_offset) { + auto status = cublasSrotmg(handle, &sd1_buffer[sd1_offset], + &sd2_buffer[sd2_offset], + &sx1_buffer[sx1_offset], + &sy1_buffer[sy1_offset], + &sparam_buffer[sparam_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXrotmg<double>(cublasHandle_t handle, double* sd1_buffer, const size_t sd1_offset, + double* sd2_buffer, const size_t sd2_offset, + double* sx1_buffer, const size_t sx1_offset, + const double* sy1_buffer, const size_t sy1_offset, + double* sparam_buffer, const size_t sparam_offset) { + auto status = cublasDrotmg(handle, &sd1_buffer[sd1_offset], + &sd2_buffer[sd2_offset], + &sx1_buffer[sx1_offset], + &sy1_buffer[sy1_offset], + &sparam_buffer[sparam_offset]); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SROT/DROT +cublasStatus_t cublasXrot(cublasHandle_t handle, const size_t n, + float* x_buffer, const size_t x_offset, const size_t x_inc, + float* y_buffer, const size_t y_offset, const size_t y_inc, + const float cos, + const float sin) { + auto status = cublasSrot(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc), + &cos, + &sin); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXrot(cublasHandle_t handle, const size_t n, + double* x_buffer, const size_t x_offset, const size_t x_inc, + double* y_buffer, const size_t y_offset, const size_t y_inc, + const double cos, + const double sin) { + auto status = cublasDrot(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc), + &cos, + &sin); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SROTM/DROTM +template <typename T> +cublasStatus_t cublasXrotm(cublasHandle_t handle, const size_t n, + T* x_buffer, const size_t x_offset, const size_t x_inc, + T* y_buffer, const size_t y_offset, const size_t y_inc, + T* sparam_buffer, const size_t sparam_offset); +template <> +cublasStatus_t cublasXrotm<float>(cublasHandle_t handle, const size_t n, + float* x_buffer, const size_t x_offset, const size_t x_inc, + float* y_buffer, const size_t y_offset, const size_t y_inc, + float* sparam_buffer, const size_t sparam_offset) { + auto status = cublasSrotm(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc), + &sparam_buffer[sparam_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXrotm<double>(cublasHandle_t handle, const size_t n, + double* x_buffer, const size_t x_offset, const size_t x_inc, + double* y_buffer, const size_t y_offset, const size_t y_inc, + double* sparam_buffer, const size_t sparam_offset) { + auto status = cublasDrotm(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc), + &sparam_buffer[sparam_offset]); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP +template <typename T> +cublasStatus_t cublasXswap(cublasHandle_t handle, const size_t n, + T* x_buffer, const size_t x_offset, const size_t x_inc, + T* y_buffer, const size_t y_offset, const size_t y_inc); +template <> +cublasStatus_t cublasXswap<float>(cublasHandle_t handle, const size_t n, + float* x_buffer, const size_t x_offset, const size_t x_inc, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasSswap(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXswap<double>(cublasHandle_t handle, const size_t n, + double* x_buffer, const size_t x_offset, const size_t x_inc, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasDswap(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXswap<float2>(cublasHandle_t handle, const size_t n, + float2* x_buffer, const size_t x_offset, const size_t x_inc, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasCswap(handle, static_cast<int>(n), + reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXswap<double2>(cublasHandle_t handle, const size_t n, + double2* x_buffer, const size_t x_offset, const size_t x_inc, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasZswap(handle, static_cast<int>(n), + reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXswap<half>(cublasHandle_t handle, const size_t n, + half* x_buffer, const size_t x_offset, const size_t x_inc, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL +cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, + const float alpha, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasSscal(handle, static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, + const double alpha, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasDscal(handle, static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, + const float2 alpha, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCscal(handle, static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, + const double2 alpha, + double2* x_buffer, const size_t x_offset, const size_t x_inc) { + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZscal(handle, static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXscal(cublasHandle_t handle, const size_t n, + const half alpha, + half* x_buffer, const size_t x_offset, const size_t x_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY +template <typename T> +cublasStatus_t cublasXcopy(cublasHandle_t handle, const size_t n, + const T* x_buffer, const size_t x_offset, const size_t x_inc, + T* y_buffer, const size_t y_offset, const size_t y_inc); +template <> +cublasStatus_t cublasXcopy<float>(cublasHandle_t handle, const size_t n, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasScopy(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXcopy<double>(cublasHandle_t handle, const size_t n, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasDcopy(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXcopy<float2>(cublasHandle_t handle, const size_t n, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasCcopy(handle, static_cast<int>(n), + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXcopy<double2>(cublasHandle_t handle, const size_t n, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasZcopy(handle, static_cast<int>(n), + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXcopy<half>(cublasHandle_t handle, const size_t n, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY +cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, + const float alpha, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasSaxpy(handle, static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, + const double alpha, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasDaxpy(handle, static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, + const float2 alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCaxpy(handle, static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, + const double2 alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZaxpy(handle, static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXaxpy(cublasHandle_t handle, const size_t n, + const half alpha, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SDOT/DDOT +template <typename T> +cublasStatus_t cublasXdot(cublasHandle_t handle, const size_t n, + T* dot_buffer, const size_t dot_offset, + const T* x_buffer, const size_t x_offset, const size_t x_inc, + const T* y_buffer, const size_t y_offset, const size_t y_inc); +template <> +cublasStatus_t cublasXdot<float>(cublasHandle_t handle, const size_t n, + float* dot_buffer, const size_t dot_offset, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasSdot(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc), + &dot_buffer[dot_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXdot<double>(cublasHandle_t handle, const size_t n, + double* dot_buffer, const size_t dot_offset, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasDdot(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc), + &dot_buffer[dot_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXdot<half>(cublasHandle_t handle, const size_t n, + half* dot_buffer, const size_t dot_offset, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for CDOTU/ZDOTU +template <typename T> +cublasStatus_t cublasXdotu(cublasHandle_t handle, const size_t n, + T* dot_buffer, const size_t dot_offset, + const T* x_buffer, const size_t x_offset, const size_t x_inc, + const T* y_buffer, const size_t y_offset, const size_t y_inc); +template <> +cublasStatus_t cublasXdotu<float2>(cublasHandle_t handle, const size_t n, + float2* dot_buffer, const size_t dot_offset, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasCdotu(handle, static_cast<int>(n), + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuComplex*>(&dot_buffer[dot_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXdotu<double2>(cublasHandle_t handle, const size_t n, + double2* dot_buffer, const size_t dot_offset, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasZdotu(handle, static_cast<int>(n), + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuDoubleComplex*>(&dot_buffer[dot_offset])); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CDOTC/ZDOTC +template <typename T> +cublasStatus_t cublasXdotc(cublasHandle_t handle, const size_t n, + T* dot_buffer, const size_t dot_offset, + const T* x_buffer, const size_t x_offset, const size_t x_inc, + const T* y_buffer, const size_t y_offset, const size_t y_inc); +template <> +cublasStatus_t cublasXdotc<float2>(cublasHandle_t handle, const size_t n, + float2* dot_buffer, const size_t dot_offset, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasCdotc(handle, static_cast<int>(n), + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuComplex*>(&dot_buffer[dot_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXdotc<double2>(cublasHandle_t handle, const size_t n, + double2* dot_buffer, const size_t dot_offset, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2* y_buffer, const size_t y_offset, const size_t y_inc) { + auto status = cublasZdotc(handle, static_cast<int>(n), + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuDoubleComplex*>(&dot_buffer[dot_offset])); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2 +template <typename T> +cublasStatus_t cublasXnrm2(cublasHandle_t handle, const size_t n, + T* nrm2_buffer, const size_t nrm2_offset, + const T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXnrm2<float>(cublasHandle_t handle, const size_t n, + float* nrm2_buffer, const size_t nrm2_offset, + const float* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasSnrm2(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &nrm2_buffer[nrm2_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXnrm2<double>(cublasHandle_t handle, const size_t n, + double* nrm2_buffer, const size_t nrm2_offset, + const double* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasDnrm2(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &nrm2_buffer[nrm2_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXnrm2<float2>(cublasHandle_t handle, const size_t n, + float2* nrm2_buffer, const size_t nrm2_offset, + const float2* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasScnrm2(handle, static_cast<int>(n), + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<float*>(&nrm2_buffer[nrm2_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXnrm2<double2>(cublasHandle_t handle, const size_t n, + double2* nrm2_buffer, const size_t nrm2_offset, + const double2* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasDznrm2(handle, static_cast<int>(n), + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<double*>(&nrm2_buffer[nrm2_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXnrm2<half>(cublasHandle_t handle, const size_t n, + half* nrm2_buffer, const size_t nrm2_offset, + const half* x_buffer, const size_t x_offset, const size_t x_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SASUM/DASUM/ScASUM/DzASUM +template <typename T> +cublasStatus_t cublasXasum(cublasHandle_t handle, const size_t n, + T* asum_buffer, const size_t asum_offset, + const T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXasum<float>(cublasHandle_t handle, const size_t n, + float* asum_buffer, const size_t asum_offset, + const float* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasSasum(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &asum_buffer[asum_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXasum<double>(cublasHandle_t handle, const size_t n, + double* asum_buffer, const size_t asum_offset, + const double* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasDasum(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + &asum_buffer[asum_offset]); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXasum<float2>(cublasHandle_t handle, const size_t n, + float2* asum_buffer, const size_t asum_offset, + const float2* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasScasum(handle, static_cast<int>(n), + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<float*>(&asum_buffer[asum_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXasum<double2>(cublasHandle_t handle, const size_t n, + double2* asum_buffer, const size_t asum_offset, + const double2* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasDzasum(handle, static_cast<int>(n), + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<double*>(&asum_buffer[asum_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXasum<half>(cublasHandle_t handle, const size_t n, + half* asum_buffer, const size_t asum_offset, + const half* x_buffer, const size_t x_offset, const size_t x_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +template <typename T> +cublasStatus_t cublasXamax(cublasHandle_t handle, const size_t n, + T* imax_buffer, const size_t imax_offset, + const T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXamax<float>(cublasHandle_t handle, const size_t n, + float* imax_buffer, const size_t imax_offset, + const float* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasIsamax(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + reinterpret_cast<int*>(&imax_buffer[imax_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXamax<double>(cublasHandle_t handle, const size_t n, + double* imax_buffer, const size_t imax_offset, + const double* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasIdamax(handle, static_cast<int>(n), + &x_buffer[x_offset], static_cast<int>(x_inc), + reinterpret_cast<int*>(&imax_buffer[imax_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXamax<float2>(cublasHandle_t handle, const size_t n, + float2* imax_buffer, const size_t imax_offset, + const float2* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasIcamax(handle, static_cast<int>(n), + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<int*>(&imax_buffer[imax_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXamax<double2>(cublasHandle_t handle, const size_t n, + double2* imax_buffer, const size_t imax_offset, + const double2* x_buffer, const size_t x_offset, const size_t x_inc) { + auto status = cublasIzamax(handle, static_cast<int>(n), + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<int*>(&imax_buffer[imax_offset])); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXamax<half>(cublasHandle_t handle, const size_t n, + half* imax_buffer, const size_t imax_offset, + const half* x_buffer, const size_t x_offset, const size_t x_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// Forwards the cuBLAS calls for SGEMV/DGEMV/CGEMV/ZGEMV +cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSgemv(handle, a_transpose, + static_cast<int>(m), static_cast<int>(n), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc), + &beta, + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDgemv(handle, a_transpose, + static_cast<int>(m), static_cast<int>(n), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc), + &beta, + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasCgemv(handle, a_transpose, + static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + &beta_cuda, + reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZgemv(handle, a_transpose, + static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + &beta_cuda, + reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV +cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSgbmv(handle, a_transpose, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(kl), static_cast<int>(ku), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc), + &beta, + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDgbmv(handle, a_transpose, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(kl), static_cast<int>(ku), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc), + &beta, + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasCgbmv(handle, a_transpose, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(kl), static_cast<int>(ku), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + &beta_cuda, + reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZgbmv(handle, a_transpose, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(kl), static_cast<int>(ku), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + &beta_cuda, + reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgbmv(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for CHEMV/ZHEMV +cublasStatus_t cublasXhemv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasChemv(handle, triangle, + static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + &beta_cuda, + reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXhemv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZhemv(handle, triangle, + static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + &beta_cuda, + reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CHBMV/ZHBMV +cublasStatus_t cublasXhbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, const size_t k, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasChbmv(handle, triangle, + static_cast<int>(n), static_cast<int>(k), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + &beta_cuda, + reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXhbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, const size_t k, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZhbmv(handle, triangle, + static_cast<int>(n), static_cast<int>(k), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + &beta_cuda, + reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CHPMV/ZHPMV +cublasStatus_t cublasXhpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float2 alpha, + const float2* ap_buffer, const size_t ap_offset, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + float2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasChpmv(handle, triangle, + static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&ap_buffer[ap_offset]), + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + &beta_cuda, + reinterpret_cast<cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXhpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double2 alpha, + const double2* ap_buffer, const size_t ap_offset, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + double2* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZhpmv(handle, triangle, + static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&ap_buffer[ap_offset]), + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + &beta_cuda, + reinterpret_cast<cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SSYMV/DSYMV +cublasStatus_t cublasXsymv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsymv(handle, triangle, + static_cast<int>(n), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc), + &beta, + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsymv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsymv(handle, triangle, + static_cast<int>(n), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc), + &beta, + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsymv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSBMV/DSBMV +cublasStatus_t cublasXsbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, const size_t k, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsbmv(handle, triangle, + static_cast<int>(n), static_cast<int>(k), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc), + &beta, + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, const size_t k, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsbmv(handle, triangle, + static_cast<int>(n), static_cast<int>(k), + &alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc), + &beta, + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, const size_t k, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSPMV/DSPMV +cublasStatus_t cublasXspmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float* ap_buffer, const size_t ap_offset, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + float* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSspmv(handle, triangle, + static_cast<int>(n), + &alpha, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast<int>(x_inc), + &beta, + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXspmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double* ap_buffer, const size_t ap_offset, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + double* y_buffer, const size_t y_offset, const size_t y_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDspmv(handle, triangle, + static_cast<int>(n), + &alpha, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast<int>(x_inc), + &beta, + &y_buffer[y_offset], static_cast<int>(y_inc)); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXspmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const half alpha, + const half* ap_buffer, const size_t ap_offset, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + half* y_buffer, const size_t y_offset, const size_t y_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for STRMV/DTRMV/CTRMV/ZTRMV +template <typename T> +cublasStatus_t cublasXtrmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const T* a_buffer, const size_t a_offset, const size_t a_ld, + T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXtrmv<float>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStrmv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrmv<double>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtrmv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrmv<float2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCtrmv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrmv<double2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + double2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZtrmv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrmv<half>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + half* x_buffer, const size_t x_offset, const size_t x_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for STBMV/DTBMV/CTBMV/ZTBMV +template <typename T> +cublasStatus_t cublasXtbmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const T* a_buffer, const size_t a_offset, const size_t a_ld, + T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXtbmv<float>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStbmv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), static_cast<int>(k), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbmv<double>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtbmv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), static_cast<int>(k), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbmv<float2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCtbmv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), static_cast<int>(k), + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbmv<double2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + double2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZtbmv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), static_cast<int>(k), + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbmv<half>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + half* x_buffer, const size_t x_offset, const size_t x_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for STPMV/DTPMV/CTPMV/ZTPMV +template <typename T> +cublasStatus_t cublasXtpmv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const T* ap_buffer, const size_t ap_offset, + T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXtpmv<float>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const float* ap_buffer, const size_t ap_offset, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStpmv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpmv<double>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double* ap_buffer, const size_t ap_offset, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtpmv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpmv<float2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const float2* ap_buffer, const size_t ap_offset, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCtpmv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + reinterpret_cast<const cuComplex*>(&ap_buffer[ap_offset]), + reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpmv<double2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double2* ap_buffer, const size_t ap_offset, + double2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZtpmv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + reinterpret_cast<const cuDoubleComplex*>(&ap_buffer[ap_offset]), + reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpmv<half>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const half* ap_buffer, const size_t ap_offset, + half* x_buffer, const size_t x_offset, const size_t x_inc) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for STRSV/DTRSV/CTRSV/ZTRSV +template <typename T> +cublasStatus_t cublasXtrsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const T* a_buffer, const size_t a_offset, const size_t a_ld, + T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXtrsv<float>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStrsv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrsv<double>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtrsv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrsv<float2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCtrsv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtrsv<double2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + double2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZtrsv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for STBSV/DTBSV/CTBSV/ZTBSV +template <typename T> +cublasStatus_t cublasXtbsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const T* a_buffer, const size_t a_offset, const size_t a_ld, + T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXtbsv<float>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStbsv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), static_cast<int>(k), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbsv<double>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtbsv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), static_cast<int>(k), + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbsv<float2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCtbsv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), static_cast<int>(k), + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtbsv<double2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, const size_t k, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + double2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZtbsv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), static_cast<int>(k), + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for STPSV/DTPSV/CTPSV/ZTPSV +template <typename T> +cublasStatus_t cublasXtpsv(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const T* ap_buffer, const size_t ap_offset, + T* x_buffer, const size_t x_offset, const size_t x_inc); +template <> +cublasStatus_t cublasXtpsv<float>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const float* ap_buffer, const size_t ap_offset, + float* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStpsv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpsv<double>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double* ap_buffer, const size_t ap_offset, + double* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtpsv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpsv<float2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const float2* ap_buffer, const size_t ap_offset, + float2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCtpsv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + reinterpret_cast<const cuComplex*>(&ap_buffer[ap_offset]), + reinterpret_cast<cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} +template <> +cublasStatus_t cublasXtpsv<double2>(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t n, + const double2* ap_buffer, const size_t ap_offset, + double2* x_buffer, const size_t x_offset, const size_t x_inc) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZtpsv(handle, triangle, a_transpose, diagonal, + static_cast<int>(n), + reinterpret_cast<const cuDoubleComplex*>(&ap_buffer[ap_offset]), + reinterpret_cast<cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SGER/DGER +cublasStatus_t cublasXger(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const float alpha, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float* y_buffer, const size_t y_offset, const size_t y_inc, + float* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSger(handle, static_cast<int>(m), static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc), + &a_buffer[a_offset], a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXger(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const double alpha, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double* y_buffer, const size_t y_offset, const size_t y_inc, + double* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDger(handle, static_cast<int>(m), static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc), + &a_buffer[a_offset], a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXger(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const half alpha, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half* y_buffer, const size_t y_offset, const size_t y_inc, + half* a_buffer, const size_t a_offset, const size_t a_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for CGERU/ZGERU +cublasStatus_t cublasXgeru(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const float2 alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2* y_buffer, const size_t y_offset, const size_t y_inc, + float2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCgeru(handle, static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgeru(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const double2 alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2* y_buffer, const size_t y_offset, const size_t y_inc, + double2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZgeru(handle, static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuDoubleComplex*>(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CGERC/ZGERC +cublasStatus_t cublasXgerc(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const float2 alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2* y_buffer, const size_t y_offset, const size_t y_inc, + float2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCgerc(handle, static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgerc(cublasHandle_t handle, const Layout layout, + const size_t m, const size_t n, + const double2 alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2* y_buffer, const size_t y_offset, const size_t y_inc, + double2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZgerc(handle, static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuDoubleComplex*>(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CHER/ZHER +cublasStatus_t cublasXher(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + float2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCher(handle, triangle, + static_cast<int>(n), + &alpha, + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXher(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + double2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZher(handle, triangle, + static_cast<int>(n), + &alpha, + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<cuDoubleComplex*>(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CHPR/ZHPR +cublasStatus_t cublasXhpr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + float2* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasChpr(handle, triangle, + static_cast<int>(n), + &alpha, + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<cuComplex*>(&ap_buffer[ap_offset])); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXhpr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + double2* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZhpr(handle, triangle, + static_cast<int>(n), + &alpha, + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<cuDoubleComplex*>(&ap_buffer[ap_offset])); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CHER2/ZHER2 +cublasStatus_t cublasXher2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float2 alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2* y_buffer, const size_t y_offset, const size_t y_inc, + float2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCher2(handle, triangle, + static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuComplex*>(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXher2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double2 alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2* y_buffer, const size_t y_offset, const size_t y_inc, + double2* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZher2(handle, triangle, + static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuDoubleComplex*>(&a_buffer[a_offset]), a_ld); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for CHPR2/ZHPR2 +cublasStatus_t cublasXhpr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float2 alpha, + const float2* x_buffer, const size_t x_offset, const size_t x_inc, + const float2* y_buffer, const size_t y_offset, const size_t y_inc, + float2* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasChpr2(handle, triangle, + static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuComplex*>(&ap_buffer[ap_offset])); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXhpr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double2 alpha, + const double2* x_buffer, const size_t x_offset, const size_t x_inc, + const double2* y_buffer, const size_t y_offset, const size_t y_inc, + double2* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZhpr2(handle, triangle, + static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&x_buffer[x_offset]), static_cast<int>(x_inc), + reinterpret_cast<const cuDoubleComplex*>(&y_buffer[y_offset]), static_cast<int>(y_inc), + reinterpret_cast<cuDoubleComplex*>(&ap_buffer[ap_offset])); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SSYR/DSYR +cublasStatus_t cublasXsyr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + float* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsyr(handle, triangle, + static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc), + &a_buffer[a_offset], a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + double* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsyr(handle, triangle, + static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc), + &a_buffer[a_offset], a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const half alpha, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + half* a_buffer, const size_t a_offset, const size_t a_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSPR/DSPR +cublasStatus_t cublasXspr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + float* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSspr(handle, triangle, + static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc), + &ap_buffer[ap_offset]); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXspr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + double* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDspr(handle, triangle, + static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc), + &ap_buffer[ap_offset]); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXspr(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const half alpha, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + half* ap_buffer, const size_t ap_offset) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSYR2/DSYR2 +cublasStatus_t cublasXsyr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float* y_buffer, const size_t y_offset, const size_t y_inc, + float* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsyr2(handle, triangle, + static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc), + &a_buffer[a_offset], a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double* y_buffer, const size_t y_offset, const size_t y_inc, + double* a_buffer, const size_t a_offset, const size_t a_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsyr2(handle, triangle, + static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc), + &a_buffer[a_offset], a_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const half alpha, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half* y_buffer, const size_t y_offset, const size_t y_inc, + half* a_buffer, const size_t a_offset, const size_t a_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSPR2/DSPR2 +cublasStatus_t cublasXspr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const float alpha, + const float* x_buffer, const size_t x_offset, const size_t x_inc, + const float* y_buffer, const size_t y_offset, const size_t y_inc, + float* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSspr2(handle, triangle, + static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc), + &ap_buffer[ap_offset]); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXspr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const double alpha, + const double* x_buffer, const size_t x_offset, const size_t x_inc, + const double* y_buffer, const size_t y_offset, const size_t y_inc, + double* ap_buffer, const size_t ap_offset) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDspr2(handle, triangle, + static_cast<int>(n), + &alpha, + &x_buffer[x_offset], static_cast<int>(x_inc), + &y_buffer[y_offset], static_cast<int>(y_inc), + &ap_buffer[ap_offset]); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXspr2(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, + const size_t n, + const half alpha, + const half* x_buffer, const size_t x_offset, const size_t x_inc, + const half* y_buffer, const size_t y_offset, const size_t y_inc, + half* ap_buffer, const size_t ap_offset) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// Forwards the cuBLAS calls for SGEMM/DGEMM/CGEMM/ZGEMM +cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, + const size_t m, const size_t n, const size_t k, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float* b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + float* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSgemm(handle, a_transpose, b_transpose, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, + const size_t m, const size_t n, const size_t k, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + double* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDgemm(handle, a_transpose, b_transpose, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, + const size_t m, const size_t n, const size_t k, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + float2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasCgemm(handle, a_transpose, b_transpose, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, + const size_t m, const size_t n, const size_t k, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZgemm(handle, a_transpose, b_transpose, + static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout, const cublasOperation_t a_transpose, const cublasOperation_t b_transpose, + const size_t m, const size_t n, const size_t k, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + half* c_buffer, const size_t c_offset, const size_t c_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM +cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float* b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + float* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsymm(handle, side, triangle, + static_cast<int>(m), static_cast<int>(n), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + double* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsymm(handle, side, triangle, + static_cast<int>(m), static_cast<int>(n), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + float2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasCsymm(handle, side, triangle, + static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZsymm(handle, side, triangle, + static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsymm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + half* c_buffer, const size_t c_offset, const size_t c_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for CHEMM/ZHEMM +cublasStatus_t cublasXhemm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + float2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasChemm(handle, side, triangle, + static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXhemm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, + const size_t m, const size_t n, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZhemm(handle, side, triangle, + static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SSYRK/DSYRK/CSYRK/ZSYRK +cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + float* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsyrk(handle, triangle, a_transpose, + static_cast<int>(n), static_cast<int>(k), + &alpha, + &a_buffer[a_offset], a_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + double* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsyrk(handle, triangle, a_transpose, + static_cast<int>(n), static_cast<int>(k), + &alpha, + &a_buffer[a_offset], a_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2 beta, + float2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasCsyrk(handle, triangle, a_transpose, + static_cast<int>(n), static_cast<int>(k), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + &beta_cuda, + reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2 beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZsyrk(handle, triangle, a_transpose, + static_cast<int>(n), static_cast<int>(k), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + &beta_cuda, + reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyrk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half beta, + half* c_buffer, const size_t c_offset, const size_t c_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for CHERK/ZHERK +cublasStatus_t cublasXherk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const float alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + float2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasCherk(handle, triangle, a_transpose, + static_cast<int>(n), static_cast<int>(k), + &alpha, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + &beta, + reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXherk(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, + const size_t n, const size_t k, + const double alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasZherk(handle, triangle, a_transpose, + static_cast<int>(n), static_cast<int>(k), + &alpha, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + &beta, + reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for SSYR2K/DSYR2K/CSYR2K/ZSYR2K +cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + const float* b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + float* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasSsyr2k(handle, triangle, ab_transpose, + static_cast<int>(n), static_cast<int>(k), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + const double* b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + double* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDsyr2k(handle, triangle, ab_transpose, + static_cast<int>(n), static_cast<int>(k), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + &beta, + &c_buffer[c_offset], c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + float2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasCsyr2k(handle, triangle, ab_transpose, + static_cast<int>(n), static_cast<int>(k), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + cuDoubleComplex beta_cuda; + beta_cuda.x = beta.real(); + beta_cuda.y = beta.imag(); + auto status = cublasZsyr2k(handle, triangle, ab_transpose, + static_cast<int>(n), static_cast<int>(k), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld, + &beta_cuda, + reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXsyr2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + const half* b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + half* c_buffer, const size_t c_offset, const size_t c_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for CHER2K/ZHER2K +cublasStatus_t cublasXher2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + const float2* b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + float2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCher2k(handle, triangle, ab_transpose, + static_cast<int>(n), static_cast<int>(k), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuComplex*>(&b_buffer[b_offset]), b_ld, + &beta, + reinterpret_cast<cuComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXher2k(cublasHandle_t handle, const Layout layout, const cublasFillMode_t triangle, const cublasOperation_t ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + const double2* b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + double2* c_buffer, const size_t c_offset, const size_t c_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZher2k(handle, triangle, ab_transpose, + static_cast<int>(n), static_cast<int>(k), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuDoubleComplex*>(&b_buffer[b_offset]), b_ld, + &beta, + reinterpret_cast<cuDoubleComplex*>(&c_buffer[c_offset]), c_ld); + cudaDeviceSynchronize(); + return status; +} + +// Forwards the cuBLAS calls for STRMM/DTRMM/CTRMM/ZTRMM +cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t m, const size_t n, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + float* b_buffer, const size_t b_offset, const size_t b_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStrmm(handle, side, triangle, a_transpose, diagonal, + static_cast<int>(m), static_cast<int>(n), + &alpha, + &a_buffer[a_offset], a_ld, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t m, const size_t n, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + double* b_buffer, const size_t b_offset, const size_t b_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtrmm(handle, side, triangle, a_transpose, diagonal, + static_cast<int>(m), static_cast<int>(n), + &alpha, + &a_buffer[a_offset], a_ld, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t m, const size_t n, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + float2* b_buffer, const size_t b_offset, const size_t b_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCtrmm(handle, side, triangle, a_transpose, diagonal, + static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<cuComplex*>(&b_buffer[b_offset]), b_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t m, const size_t n, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + double2* b_buffer, const size_t b_offset, const size_t b_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZtrmm(handle, side, triangle, a_transpose, diagonal, + static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<cuDoubleComplex*>(&b_buffer[b_offset]), b_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXtrmm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t m, const size_t n, + const half alpha, + const half* a_buffer, const size_t a_offset, const size_t a_ld, + half* b_buffer, const size_t b_offset, const size_t b_ld) { + return CUBLAS_STATUS_NOT_SUPPORTED; +} + +// Forwards the cuBLAS calls for STRSM/DTRSM/CTRSM/ZTRSM +cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t m, const size_t n, + const float alpha, + const float* a_buffer, const size_t a_offset, const size_t a_ld, + float* b_buffer, const size_t b_offset, const size_t b_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasStrsm(handle, side, triangle, a_transpose, diagonal, + static_cast<int>(m), static_cast<int>(n), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t m, const size_t n, + const double alpha, + const double* a_buffer, const size_t a_offset, const size_t a_ld, + double* b_buffer, const size_t b_offset, const size_t b_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + auto status = cublasDtrsm(handle, side, triangle, a_transpose, diagonal, + static_cast<int>(m), static_cast<int>(n), + &alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t m, const size_t n, + const float2 alpha, + const float2* a_buffer, const size_t a_offset, const size_t a_ld, + float2* b_buffer, const size_t b_offset, const size_t b_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasCtrsm(handle, side, triangle, a_transpose, diagonal, + static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<cuComplex*>(&b_buffer[b_offset]), b_ld); + cudaDeviceSynchronize(); + return status; +} +cublasStatus_t cublasXtrsm(cublasHandle_t handle, const Layout layout, const cublasSideMode_t side, const cublasFillMode_t triangle, const cublasOperation_t a_transpose, const cublasDiagType_t diagonal, + const size_t m, const size_t n, + const double2 alpha, + const double2* a_buffer, const size_t a_offset, const size_t a_ld, + double2* b_buffer, const size_t b_offset, const size_t b_ld) { + if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; } + cuDoubleComplex alpha_cuda; + alpha_cuda.x = alpha.real(); + alpha_cuda.y = alpha.imag(); + auto status = cublasZtrsm(handle, side, triangle, a_transpose, diagonal, + static_cast<int>(m), static_cast<int>(n), + &alpha_cuda, + reinterpret_cast<const cuDoubleComplex*>(&a_buffer[a_offset]), a_ld, + reinterpret_cast<cuDoubleComplex*>(&b_buffer[b_offset]), b_ld); + cudaDeviceSynchronize(); + return status; +} + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_WRAPPER_CUBLAS_H_ +#endif diff --git a/test/wrapper_cuda.hpp b/test/wrapper_cuda.hpp new file mode 100644 index 00000000..c97ae3ef --- /dev/null +++ b/test/wrapper_cuda.hpp @@ -0,0 +1,149 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains all the CUDA related code; used only in case of testing against cuBLAS +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_WRAPPER_CUDA_H_ +#define CLBLAST_TEST_WRAPPER_CUDA_H_ + +#include <string> +#include <vector> +#include <memory> +#include <stdexcept> + +#include "utilities/utilities.hpp" + +#ifdef CLBLAST_REF_CUBLAS + #include <cuda_runtime.h> + #include <cublas_v2.h> +#endif + +namespace clblast { +// ================================================================================================= + +#ifdef CLBLAST_REF_CUBLAS + template <typename T> + void cublasSetup(Arguments<T> &args) { + cudaSetDevice(static_cast<int>(args.device_id)); + auto status = cublasCreate(reinterpret_cast<cublasHandle_t*>(&args.cublas_handle)); + if (status != CUBLAS_STATUS_SUCCESS) { + throw std::runtime_error("CUDA cublasCreate error"); + } + } +#endif + +#ifdef CLBLAST_REF_CUBLAS + template <typename T> + void cublasTeardown(Arguments<T> &args) { + auto status = cublasDestroy(reinterpret_cast<cublasHandle_t>(args.cublas_handle)); + if (status != CUBLAS_STATUS_SUCCESS) { + throw std::runtime_error("CUDA cublasDestroy error"); + } + } +#endif + +// ================================================================================================= + +// Copies data from the CUDA device to the host and frees-up the CUDA memory afterwards +#ifdef CLBLAST_REF_CUBLAS + template <typename T> + void CUDAToHost(T** buffer_cuda, std::vector<T> &buffer_host, const size_t size) { + auto status1 = cudaMemcpy( + reinterpret_cast<void*>(buffer_host.data()), + reinterpret_cast<void*>(*buffer_cuda), + size*sizeof(T), + cudaMemcpyDeviceToHost + ); + if (status1 != cudaSuccess) { + throw std::runtime_error("CUDA cudaMemcpy error with status: "+ToString(static_cast<int>(status1))); + } + auto status2 = cudaFree(*buffer_cuda); + if (status2 != cudaSuccess) { + throw std::runtime_error("CUDA cudaFree error with status: "+ToString(static_cast<int>(status2))); + } + *buffer_cuda = nullptr; +} +#else + template <typename T> void CUDAToHost(T**, const std::vector<T>&, const size_t) { } +#endif + +// Allocates space on the CUDA device and copies in data from the host +#ifdef CLBLAST_REF_CUBLAS + template <typename T> + void HostToCUDA(T** buffer_cuda, std::vector<T> &buffer_host, const size_t size) { + if (*buffer_cuda == nullptr) { + auto status1 = cudaMalloc(reinterpret_cast<void**>(buffer_cuda), size*sizeof(T)); + if (status1 != cudaSuccess) { + throw std::runtime_error("CUDA cudaMalloc error with status: "+ToString(static_cast<int>(status1))); + } + } + auto status2 = cudaMemcpy( + reinterpret_cast<void*>(*buffer_cuda), + reinterpret_cast<void*>(buffer_host.data()), + size*sizeof(T), + cudaMemcpyHostToDevice + ); + if (status2 != cudaSuccess) { + throw std::runtime_error("CUDA cudaMemcpy error with status: "+ToString(static_cast<int>(status2))); + } + } +#else + template <typename T> void HostToCUDA(T**, const std::vector<T>&, const size_t) { } +#endif + +// ================================================================================================= + +template <typename T> +struct BuffersCUDA { + T* x_vec = nullptr; + T* y_vec = nullptr; + T* a_mat = nullptr; + T* b_mat = nullptr; + T* c_mat = nullptr; + T* ap_mat = nullptr; + T* scalar = nullptr; +}; + +template <typename T, typename U> +void CUDAToHost(const Arguments<U> &args, BuffersCUDA<T> &buffers, BuffersHost<T> &buffers_host, + const std::vector<std::string> &names) { + for (auto &name: names) { + if (name == kBufVecX) { buffers_host.x_vec = std::vector<T>(args.x_size, static_cast<T>(0)); CUDAToHost(&buffers.x_vec, buffers_host.x_vec, args.x_size); } + else if (name == kBufVecY) { buffers_host.y_vec = std::vector<T>(args.y_size, static_cast<T>(0)); CUDAToHost(&buffers.y_vec, buffers_host.y_vec, args.y_size); } + else if (name == kBufMatA) { buffers_host.a_mat = std::vector<T>(args.a_size, static_cast<T>(0)); CUDAToHost(&buffers.a_mat, buffers_host.a_mat, args.a_size); } + else if (name == kBufMatB) { buffers_host.b_mat = std::vector<T>(args.b_size, static_cast<T>(0)); CUDAToHost(&buffers.b_mat, buffers_host.b_mat, args.b_size); } + else if (name == kBufMatC) { buffers_host.c_mat = std::vector<T>(args.c_size, static_cast<T>(0)); CUDAToHost(&buffers.c_mat, buffers_host.c_mat, args.c_size); } + else if (name == kBufMatAP) { buffers_host.ap_mat = std::vector<T>(args.ap_size, static_cast<T>(0)); CUDAToHost(&buffers.ap_mat, buffers_host.ap_mat, args.ap_size); } + else if (name == kBufScalar) { buffers_host.scalar = std::vector<T>(args.scalar_size, static_cast<T>(0)); CUDAToHost(&buffers.scalar, buffers_host.scalar, args.scalar_size); } + else { throw std::runtime_error("Invalid buffer name"); } + } +} + +template <typename T, typename U> +void HostToCUDA(const Arguments<U> &args, BuffersCUDA<T> &buffers, BuffersHost<T> &buffers_host, + const std::vector<std::string> &names) { + for (auto &name: names) { + if (name == kBufVecX) { HostToCUDA(&buffers.x_vec, buffers_host.x_vec, args.x_size); } + else if (name == kBufVecY) { HostToCUDA(&buffers.y_vec, buffers_host.y_vec, args.y_size); } + else if (name == kBufMatA) { HostToCUDA(&buffers.a_mat, buffers_host.a_mat, args.a_size); } + else if (name == kBufMatB) { HostToCUDA(&buffers.b_mat, buffers_host.b_mat, args.b_size); } + else if (name == kBufMatC) { HostToCUDA(&buffers.c_mat, buffers_host.c_mat, args.c_size); } + else if (name == kBufMatAP) { HostToCUDA(&buffers.ap_mat, buffers_host.ap_mat, args.ap_size); } + else if (name == kBufScalar) { HostToCUDA(&buffers.scalar, buffers_host.scalar, args.scalar_size); } + else { throw std::runtime_error("Invalid buffer name"); } + } +} + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_WRAPPER_CUDA_H_ +#endif |