diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-05-30 11:11:28 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2016-05-30 11:11:28 +0200 |
commit | 61105e38100d323ea270f2cbee0a824d401eaa77 (patch) | |
tree | a6f8af9f6e75b57870bfce119f037093a46d2e9c /test | |
parent | 182d2cffa163688e2ae08d5d526f8eb63914b6ac (diff) | |
parent | 03182f9d07533f795a498936391da744d982e8e2 (diff) |
Merge branch 'half_precision' into development
Diffstat (limited to 'test')
99 files changed, 1845 insertions, 860 deletions
diff --git a/test/correctness/routines/level1/xamax.cc b/test/correctness/routines/level1/xamax.cc index ade09e7a..648abaa6 100644 --- a/test/correctness/routines/level1/xamax.cc +++ b/test/correctness/routines/level1/xamax.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXamax<double>, double, double>(argc, argv, true, "iDAMAX"); clblast::RunTests<clblast::TestXamax<float2>, float2, float2>(argc, argv, true, "iCAMAX"); clblast::RunTests<clblast::TestXamax<double2>, double2, double2>(argc, argv, true, "iZAMAX"); + clblast::RunTests<clblast::TestXamax<half>, half, half>(argc, argv, true, "iHAMAX"); return 0; } diff --git a/test/correctness/routines/level1/xasum.cc b/test/correctness/routines/level1/xasum.cc index 5ec20596..d3b036c7 100644 --- a/test/correctness/routines/level1/xasum.cc +++ b/test/correctness/routines/level1/xasum.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXasum<double>, double, double>(argc, argv, true, "DASUM"); clblast::RunTests<clblast::TestXasum<float2>, float2, float2>(argc, argv, true, "ScASUM"); clblast::RunTests<clblast::TestXasum<double2>, double2, double2>(argc, argv, true, "DzASUM"); + clblast::RunTests<clblast::TestXasum<half>, half, half>(argc, argv, true, "HASUM"); return 0; } diff --git a/test/correctness/routines/level1/xaxpy.cc b/test/correctness/routines/level1/xaxpy.cc index 746e0001..04f4c128 100644 --- a/test/correctness/routines/level1/xaxpy.cc +++ b/test/correctness/routines/level1/xaxpy.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXaxpy<double>, double, double>(argc, argv, true, "DAXPY"); clblast::RunTests<clblast::TestXaxpy<float2>, float2, float2>(argc, argv, true, "CAXPY"); clblast::RunTests<clblast::TestXaxpy<double2>, double2, double2>(argc, argv, true, "ZAXPY"); + clblast::RunTests<clblast::TestXaxpy<half>, half, half>(argc, argv, true, "HAXPY"); return 0; } diff --git a/test/correctness/routines/level1/xcopy.cc b/test/correctness/routines/level1/xcopy.cc index 3e16ffc6..316c6982 100644 --- a/test/correctness/routines/level1/xcopy.cc +++ b/test/correctness/routines/level1/xcopy.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXcopy<double>, double, double>(argc, argv, true, "DCOPY"); clblast::RunTests<clblast::TestXcopy<float2>, float2, float2>(argc, argv, true, "CCOPY"); clblast::RunTests<clblast::TestXcopy<double2>, double2, double2>(argc, argv, true, "ZCOPY"); + clblast::RunTests<clblast::TestXcopy<half>, half, half>(argc, argv, true, "HCOPY"); return 0; } diff --git a/test/correctness/routines/level1/xdot.cc b/test/correctness/routines/level1/xdot.cc index 5ea105e0..72dc9d5e 100644 --- a/test/correctness/routines/level1/xdot.cc +++ b/test/correctness/routines/level1/xdot.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXdot<float>, float, float>(argc, argv, false, "SDOT"); clblast::RunTests<clblast::TestXdot<double>, double, double>(argc, argv, true, "DDOT"); + clblast::RunTests<clblast::TestXdot<half>, half, half>(argc, argv, true, "HDOT"); return 0; } diff --git a/test/correctness/routines/level1/xnrm2.cc b/test/correctness/routines/level1/xnrm2.cc index 97fb0ad6..0fe8dc33 100644 --- a/test/correctness/routines/level1/xnrm2.cc +++ b/test/correctness/routines/level1/xnrm2.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXnrm2<double>, double, double>(argc, argv, true, "DNRM2"); clblast::RunTests<clblast::TestXnrm2<float2>, float2, float2>(argc, argv, true, "ScNRM2"); clblast::RunTests<clblast::TestXnrm2<double2>, double2, double2>(argc, argv, true, "DzNRM2"); + clblast::RunTests<clblast::TestXnrm2<half>, half, half>(argc, argv, true, "HNRM2"); return 0; } diff --git a/test/correctness/routines/level1/xscal.cc b/test/correctness/routines/level1/xscal.cc index 4d138fad..9146e5ce 100644 --- a/test/correctness/routines/level1/xscal.cc +++ b/test/correctness/routines/level1/xscal.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXscal<double>, double, double>(argc, argv, true, "DSCAL"); clblast::RunTests<clblast::TestXscal<float2>, float2, float2>(argc, argv, true, "CSCAL"); clblast::RunTests<clblast::TestXscal<double2>, double2, double2>(argc, argv, true, "ZSCAL"); + clblast::RunTests<clblast::TestXscal<half>, half, half>(argc, argv, true, "HSCAL"); return 0; } diff --git a/test/correctness/routines/level1/xswap.cc b/test/correctness/routines/level1/xswap.cc index 38f110f7..636a5b0f 100644 --- a/test/correctness/routines/level1/xswap.cc +++ b/test/correctness/routines/level1/xswap.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXswap<double>, double, double>(argc, argv, true, "DSWAP"); clblast::RunTests<clblast::TestXswap<float2>, float2, float2>(argc, argv, true, "CSWAP"); clblast::RunTests<clblast::TestXswap<double2>, double2, double2>(argc, argv, true, "ZSWAP"); + clblast::RunTests<clblast::TestXswap<half>, half, half>(argc, argv, true, "HSWAP"); return 0; } diff --git a/test/correctness/routines/level2/xgbmv.cc b/test/correctness/routines/level2/xgbmv.cc index b28c5978..528a3325 100644 --- a/test/correctness/routines/level2/xgbmv.cc +++ b/test/correctness/routines/level2/xgbmv.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXgbmv<double>, double, double>(argc, argv, true, "DGBMV"); clblast::RunTests<clblast::TestXgbmv<float2>, float2, float2>(argc, argv, true, "CGBMV"); clblast::RunTests<clblast::TestXgbmv<double2>, double2, double2>(argc, argv, true, "ZGBMV"); + clblast::RunTests<clblast::TestXgbmv<half>, half, half>(argc, argv, true, "HGBMV"); return 0; } diff --git a/test/correctness/routines/level2/xgemv.cc b/test/correctness/routines/level2/xgemv.cc index 14eb74d1..fc1cf3eb 100644 --- a/test/correctness/routines/level2/xgemv.cc +++ b/test/correctness/routines/level2/xgemv.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXgemv<double>, double, double>(argc, argv, true, "DGEMV"); clblast::RunTests<clblast::TestXgemv<float2>, float2, float2>(argc, argv, true, "CGEMV"); clblast::RunTests<clblast::TestXgemv<double2>, double2, double2>(argc, argv, true, "ZGEMV"); + clblast::RunTests<clblast::TestXgemv<half>, half, half>(argc, argv, true, "HGEMV"); return 0; } diff --git a/test/correctness/routines/level2/xger.cc b/test/correctness/routines/level2/xger.cc index c37a5c41..c3c33ae6 100644 --- a/test/correctness/routines/level2/xger.cc +++ b/test/correctness/routines/level2/xger.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXger<float>, float, float>(argc, argv, false, "SGER"); clblast::RunTests<clblast::TestXger<double>, double, double>(argc, argv, true, "DGER"); + clblast::RunTests<clblast::TestXger<half>, half, half>(argc, argv, true, "HGER"); return 0; } diff --git a/test/correctness/routines/level2/xsbmv.cc b/test/correctness/routines/level2/xsbmv.cc index 212e2c3a..c2effcc2 100644 --- a/test/correctness/routines/level2/xsbmv.cc +++ b/test/correctness/routines/level2/xsbmv.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXsbmv<float>, float, float>(argc, argv, false, "SSBMV"); clblast::RunTests<clblast::TestXsbmv<double>, double, double>(argc, argv, true, "DSBMV"); + clblast::RunTests<clblast::TestXsbmv<half>, half, half>(argc, argv, true, "HSBMV"); return 0; } diff --git a/test/correctness/routines/level2/xspmv.cc b/test/correctness/routines/level2/xspmv.cc index dc833024..4142636d 100644 --- a/test/correctness/routines/level2/xspmv.cc +++ b/test/correctness/routines/level2/xspmv.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXspmv<float>, float, float>(argc, argv, false, "SSPMV"); clblast::RunTests<clblast::TestXspmv<double>, double, double>(argc, argv, true, "DSPMV"); + clblast::RunTests<clblast::TestXspmv<half>, half, half>(argc, argv, true, "HSPMV"); return 0; } diff --git a/test/correctness/routines/level2/xspr.cc b/test/correctness/routines/level2/xspr.cc index a0104dd4..c068b448 100644 --- a/test/correctness/routines/level2/xspr.cc +++ b/test/correctness/routines/level2/xspr.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXspr<float>, float, float>(argc, argv, false, "SSPR"); clblast::RunTests<clblast::TestXspr<double>, double, double>(argc, argv, true, "DSPR"); + clblast::RunTests<clblast::TestXspr<half>, half, half>(argc, argv, true, "HSPR"); return 0; } diff --git a/test/correctness/routines/level2/xspr2.cc b/test/correctness/routines/level2/xspr2.cc index 5fe5827f..904870d5 100644 --- a/test/correctness/routines/level2/xspr2.cc +++ b/test/correctness/routines/level2/xspr2.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXspr2<float>, float, float>(argc, argv, false, "SSPR2"); clblast::RunTests<clblast::TestXspr2<double>, double, double>(argc, argv, true, "DSPR2"); + clblast::RunTests<clblast::TestXspr2<half>, half, half>(argc, argv, true, "HSPR2"); return 0; } diff --git a/test/correctness/routines/level2/xsymv.cc b/test/correctness/routines/level2/xsymv.cc index 6224739f..eb9b6eb7 100644 --- a/test/correctness/routines/level2/xsymv.cc +++ b/test/correctness/routines/level2/xsymv.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXsymv<float>, float, float>(argc, argv, false, "SSYMV"); clblast::RunTests<clblast::TestXsymv<double>, double, double>(argc, argv, true, "DSYMV"); + clblast::RunTests<clblast::TestXsymv<half>, half, half>(argc, argv, true, "HSYMV"); return 0; } diff --git a/test/correctness/routines/level2/xsyr.cc b/test/correctness/routines/level2/xsyr.cc index a47b918f..eccf95e0 100644 --- a/test/correctness/routines/level2/xsyr.cc +++ b/test/correctness/routines/level2/xsyr.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXsyr<float>, float, float>(argc, argv, false, "SSYR"); clblast::RunTests<clblast::TestXsyr<double>, double, double>(argc, argv, true, "DSYR"); + clblast::RunTests<clblast::TestXsyr<half>, half, half>(argc, argv, true, "HSYR"); return 0; } diff --git a/test/correctness/routines/level2/xsyr2.cc b/test/correctness/routines/level2/xsyr2.cc index 1743632c..46c939d2 100644 --- a/test/correctness/routines/level2/xsyr2.cc +++ b/test/correctness/routines/level2/xsyr2.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXsyr2<float>, float, float>(argc, argv, false, "SSYR2"); clblast::RunTests<clblast::TestXsyr2<double>, double, double>(argc, argv, true, "DSYR2"); + clblast::RunTests<clblast::TestXsyr2<half>, half, half>(argc, argv, true, "HSYR2"); return 0; } diff --git a/test/correctness/routines/level2/xtbmv.cc b/test/correctness/routines/level2/xtbmv.cc index d3bbbade..252abdc4 100644 --- a/test/correctness/routines/level2/xtbmv.cc +++ b/test/correctness/routines/level2/xtbmv.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXtbmv<double>, double, double>(argc, argv, true, "DTBMV"); clblast::RunTests<clblast::TestXtbmv<float2>, float2, float2>(argc, argv, true, "CTBMV"); clblast::RunTests<clblast::TestXtbmv<double2>, double2, double2>(argc, argv, true, "ZTBMV"); + clblast::RunTests<clblast::TestXtbmv<half>, half, half>(argc, argv, true, "HTBMV"); return 0; } diff --git a/test/correctness/routines/level2/xtpmv.cc b/test/correctness/routines/level2/xtpmv.cc index 95489a65..b8776faa 100644 --- a/test/correctness/routines/level2/xtpmv.cc +++ b/test/correctness/routines/level2/xtpmv.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXtpmv<double>, double, double>(argc, argv, true, "DTPMV"); clblast::RunTests<clblast::TestXtpmv<float2>, float2, float2>(argc, argv, true, "CTPMV"); clblast::RunTests<clblast::TestXtpmv<double2>, double2, double2>(argc, argv, true, "ZTPMV"); + clblast::RunTests<clblast::TestXtpmv<half>, half, half>(argc, argv, true, "HTPMV"); return 0; } diff --git a/test/correctness/routines/level2/xtrmv.cc b/test/correctness/routines/level2/xtrmv.cc index ca50af88..256fe900 100644 --- a/test/correctness/routines/level2/xtrmv.cc +++ b/test/correctness/routines/level2/xtrmv.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXtrmv<double>, double, double>(argc, argv, true, "DTRMV"); clblast::RunTests<clblast::TestXtrmv<float2>, float2, float2>(argc, argv, true, "CTRMV"); clblast::RunTests<clblast::TestXtrmv<double2>, double2, double2>(argc, argv, true, "ZTRMV"); + clblast::RunTests<clblast::TestXtrmv<half>, half, half>(argc, argv, true, "HTRMV"); return 0; } diff --git a/test/correctness/routines/level3/xgemm.cc b/test/correctness/routines/level3/xgemm.cc index 632724ed..f8c8a891 100644 --- a/test/correctness/routines/level3/xgemm.cc +++ b/test/correctness/routines/level3/xgemm.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXgemm<double>, double, double>(argc, argv, true, "DGEMM"); clblast::RunTests<clblast::TestXgemm<float2>, float2, float2>(argc, argv, true, "CGEMM"); clblast::RunTests<clblast::TestXgemm<double2>, double2, double2>(argc, argv, true, "ZGEMM"); + clblast::RunTests<clblast::TestXgemm<half>, half, half>(argc, argv, true, "HGEMM"); return 0; } diff --git a/test/correctness/routines/level3/xsymm.cc b/test/correctness/routines/level3/xsymm.cc index 046fca16..c29f03dd 100644 --- a/test/correctness/routines/level3/xsymm.cc +++ b/test/correctness/routines/level3/xsymm.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXsymm<double>, double, double>(argc, argv, true, "DSYMM"); clblast::RunTests<clblast::TestXsymm<float2>, float2, float2>(argc, argv, true, "CSYMM"); clblast::RunTests<clblast::TestXsymm<double2>, double2, double2>(argc, argv, true, "ZSYMM"); + clblast::RunTests<clblast::TestXsymm<half>, half, half>(argc, argv, true, "HSYMM"); return 0; } diff --git a/test/correctness/routines/level3/xsyr2k.cc b/test/correctness/routines/level3/xsyr2k.cc index db2b83d9..9f9c87d8 100644 --- a/test/correctness/routines/level3/xsyr2k.cc +++ b/test/correctness/routines/level3/xsyr2k.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXsyr2k<double>, double, double>(argc, argv, true, "DSYR2K"); clblast::RunTests<clblast::TestXsyr2k<float2>, float2, float2>(argc, argv, true, "CSYR2K"); clblast::RunTests<clblast::TestXsyr2k<double2>, double2, double2>(argc, argv, true, "ZSYR2K"); + clblast::RunTests<clblast::TestXsyr2k<half>, half, half>(argc, argv, true, "HSYR2K"); return 0; } diff --git a/test/correctness/routines/level3/xsyrk.cc b/test/correctness/routines/level3/xsyrk.cc index 3dad3535..12343074 100644 --- a/test/correctness/routines/level3/xsyrk.cc +++ b/test/correctness/routines/level3/xsyrk.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXsyrk<double>, double, double>(argc, argv, true, "DSYRK"); clblast::RunTests<clblast::TestXsyrk<float2>, float2, float2>(argc, argv, true, "CSYRK"); clblast::RunTests<clblast::TestXsyrk<double2>, double2, double2>(argc, argv, true, "ZSYRK"); + clblast::RunTests<clblast::TestXsyrk<half>, half, half>(argc, argv, true, "HSYRK"); return 0; } diff --git a/test/correctness/routines/level3/xtrmm.cc b/test/correctness/routines/level3/xtrmm.cc index 2d843e3e..aca73f0d 100644 --- a/test/correctness/routines/level3/xtrmm.cc +++ b/test/correctness/routines/level3/xtrmm.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXtrmm<double>, double, double>(argc, argv, true, "DTRMM"); clblast::RunTests<clblast::TestXtrmm<float2>, float2, float2>(argc, argv, true, "CTRMM"); clblast::RunTests<clblast::TestXtrmm<double2>, double2, double2>(argc, argv, true, "ZTRMM"); + clblast::RunTests<clblast::TestXtrmm<half>, half, half>(argc, argv, true, "HTRMM"); return 0; } diff --git a/test/correctness/routines/level3/xtrsm.cc b/test/correctness/routines/level3/xtrsm.cc index b5f5045e..b050269a 100644 --- a/test/correctness/routines/level3/xtrsm.cc +++ b/test/correctness/routines/level3/xtrsm.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests<clblast::TestXtrsm<double>, double, double>(argc, argv, true, "DTRSM"); clblast::RunTests<clblast::TestXtrsm<float2>, float2, float2>(argc, argv, true, "CTRSM"); clblast::RunTests<clblast::TestXtrsm<double2>, double2, double2>(argc, argv, true, "ZTRSM"); + clblast::RunTests<clblast::TestXtrsm<half>, half, half>(argc, argv, true, "HTRSM"); return 0; } diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index e70c0361..50871402 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -20,6 +20,7 @@ namespace clblast { // ================================================================================================= // The transpose-options to test with (data-type dependent) +template <> const std::vector<Transpose> TestBlas<half,half>::kTransposes = {Transpose::kNo, Transpose::kYes}; template <> const std::vector<Transpose> TestBlas<float,float>::kTransposes = {Transpose::kNo, Transpose::kYes}; template <> const std::vector<Transpose> TestBlas<double,double>::kTransposes = {Transpose::kNo, Transpose::kYes}; template <> const std::vector<Transpose> TestBlas<float2,float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate}; @@ -147,10 +148,8 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st if (verbose_) { if (get_id2_(args) == 1) { fprintf(stdout, "\n Error at index %zu: ", id1); } else { fprintf(stdout, "\n Error at %zu,%zu: ", id1, id2); } - std::cout << result1[index]; - fprintf(stdout, " (reference) versus "); - std::cout << result2[index]; - fprintf(stdout, " (CLBlast)"); + fprintf(stdout, " %s (reference) versus ", ToString(result1[index]).c_str()); + fprintf(stdout, " %s (CLBlast)", ToString(result2[index]).c_str()); } } } @@ -171,6 +170,7 @@ template <typename T, typename U> void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name) { if (!PrecisionSupported<T>(device_)) { return; } if (!compare_clblas_) { return; } + if (std::is_same<T, half>::value) { return; } TestStart("invalid buffer sizes", name); // Iterates over all the to-be-tested combinations of arguments @@ -222,6 +222,7 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st // ================================================================================================= // Compiles the templated class +template class TestBlas<half, half>; template class TestBlas<float, float>; template class TestBlas<double, double>; template class TestBlas<float2, float2>; diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index 85ae7091..5b603585 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -351,11 +351,11 @@ bool TestSimilarity(const T val1, const T val2) { } } -// Compiles the default case for non-complex data-types +// Compiles the default case for standard data-types template bool TestSimilarity<float>(const float, const float); template bool TestSimilarity<double>(const double, const double); -// Specialisations for complex data-types +// Specialisations for non-standard data-types template <> bool TestSimilarity(const float2 val1, const float2 val2) { auto real = TestSimilarity(val1.real(), val2.real()); @@ -368,6 +368,10 @@ bool TestSimilarity(const double2 val1, const double2 val2) { auto imag = TestSimilarity(val1.imag(), val2.imag()); return (real && imag); } +template <> +bool TestSimilarity(const half val1, const half val2) { + return TestSimilarity(HalfToFloat(val1), HalfToFloat(val2)); +} // ================================================================================================= @@ -389,10 +393,15 @@ template <> const std::vector<double2> GetExampleScalars(const bool full_test) { if (full_test) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; } else { return {{2.42, 3.14}}; } } +template <> const std::vector<half> GetExampleScalars(const bool full_test) { + if (full_test) { return {FloatToHalf(0.0f), FloatToHalf(1.0f), FloatToHalf(3.14f)}; } + else { return {FloatToHalf(3.14f)}; } +} // ================================================================================================= // Compiles the templated class +template class Tester<half, half>; template class Tester<float, float>; template class Tester<double, double>; template class Tester<float2, float2>; diff --git a/test/performance/client.cc b/test/performance/client.cc index 9aaf1e4e..5a7226df 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -116,6 +116,17 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric // which is thus always displayed (unless silence is specified). if (!args.silent) { fprintf(stdout, "%s\n", help.c_str()); } + // Comparison against clBLAS or a CPU BLAS library is not supported in case of half-precision + if (args.precision == Precision::kHalf) { + if (args.compare_clblas != 0 || args.compare_cblas != 0) { + if (!args.silent) { + fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for half-precision\n\n"); + } + } + args.compare_clblas = 0; + args.compare_cblas = 0; + } + // Returns the arguments return args; } @@ -339,6 +350,7 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, // ================================================================================================= // Compiles the templated class +template class Client<half,half>; template class Client<float,float>; template class Client<double,double>; template class Client<float2,float2>; diff --git a/test/performance/routines/level1/xamax.cc b/test/performance/routines/level1/xamax.cc index 85caa483..4af1f1c0 100644 --- a/test/performance/routines/level1/xamax.cc +++ b/test/performance/routines/level1/xamax.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXamax<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXamax<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xasum.cc b/test/performance/routines/level1/xasum.cc index 2680966e..8e098890 100644 --- a/test/performance/routines/level1/xasum.cc +++ b/test/performance/routines/level1/xasum.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXasum<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXasum<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xaxpy.cc b/test/performance/routines/level1/xaxpy.cc index b423bc3a..b48c290d 100644 --- a/test/performance/routines/level1/xaxpy.cc +++ b/test/performance/routines/level1/xaxpy.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXaxpy<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXaxpy<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xcopy.cc b/test/performance/routines/level1/xcopy.cc index c04c6c1c..b7c60f0f 100644 --- a/test/performance/routines/level1/xcopy.cc +++ b/test/performance/routines/level1/xcopy.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXcopy<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXcopy<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xdot.cc b/test/performance/routines/level1/xdot.cc index f4616464..3edf2590 100644 --- a/test/performance/routines/level1/xdot.cc +++ b/test/performance/routines/level1/xdot.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXdot<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXdot<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xnrm2.cc b/test/performance/routines/level1/xnrm2.cc index db6ec9ad..f167df95 100644 --- a/test/performance/routines/level1/xnrm2.cc +++ b/test/performance/routines/level1/xnrm2.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXnrm2<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXnrm2<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xscal.cc b/test/performance/routines/level1/xscal.cc index bd38f43e..35e21ba8 100644 --- a/test/performance/routines/level1/xscal.cc +++ b/test/performance/routines/level1/xscal.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXscal<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXscal<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xswap.cc b/test/performance/routines/level1/xswap.cc index 112641d3..4791d4c3 100644 --- a/test/performance/routines/level1/xswap.cc +++ b/test/performance/routines/level1/xswap.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXswap<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXswap<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xgbmv.cc b/test/performance/routines/level2/xgbmv.cc index b050184d..be4056de 100644 --- a/test/performance/routines/level2/xgbmv.cc +++ b/test/performance/routines/level2/xgbmv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXgbmv<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXgbmv<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xgemv.cc b/test/performance/routines/level2/xgemv.cc index 51ab9a10..50e6225a 100644 --- a/test/performance/routines/level2/xgemv.cc +++ b/test/performance/routines/level2/xgemv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXgemv<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXgemv<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xger.cc b/test/performance/routines/level2/xger.cc index 2d956346..b1b5a268 100644 --- a/test/performance/routines/level2/xger.cc +++ b/test/performance/routines/level2/xger.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXger<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXger<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xsbmv.cc b/test/performance/routines/level2/xsbmv.cc index eabab3b7..5fb6e8c0 100644 --- a/test/performance/routines/level2/xsbmv.cc +++ b/test/performance/routines/level2/xsbmv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXsbmv<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXsbmv<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xspmv.cc b/test/performance/routines/level2/xspmv.cc index 2a9ef925..e0ee2075 100644 --- a/test/performance/routines/level2/xspmv.cc +++ b/test/performance/routines/level2/xspmv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXspmv<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXspmv<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xspr.cc b/test/performance/routines/level2/xspr.cc index 84331d74..19651679 100644 --- a/test/performance/routines/level2/xspr.cc +++ b/test/performance/routines/level2/xspr.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXspr<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXspr<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xspr2.cc b/test/performance/routines/level2/xspr2.cc index c42009a1..8745c004 100644 --- a/test/performance/routines/level2/xspr2.cc +++ b/test/performance/routines/level2/xspr2.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXspr2<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXspr2<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xsymv.cc b/test/performance/routines/level2/xsymv.cc index 3f72fe77..42de1ed5 100644 --- a/test/performance/routines/level2/xsymv.cc +++ b/test/performance/routines/level2/xsymv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXsymv<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXsymv<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xsyr.cc b/test/performance/routines/level2/xsyr.cc index 6b31d3a9..310bfb5e 100644 --- a/test/performance/routines/level2/xsyr.cc +++ b/test/performance/routines/level2/xsyr.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXsyr<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXsyr<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xsyr2.cc b/test/performance/routines/level2/xsyr2.cc index 0ad59d2d..bbeed3db 100644 --- a/test/performance/routines/level2/xsyr2.cc +++ b/test/performance/routines/level2/xsyr2.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXsyr2<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXsyr2<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xtbmv.cc b/test/performance/routines/level2/xtbmv.cc index a3297f34..24eec61f 100644 --- a/test/performance/routines/level2/xtbmv.cc +++ b/test/performance/routines/level2/xtbmv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXtbmv<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXtbmv<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xtpmv.cc b/test/performance/routines/level2/xtpmv.cc index 72477f2d..2f2487f8 100644 --- a/test/performance/routines/level2/xtpmv.cc +++ b/test/performance/routines/level2/xtpmv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXtpmv<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXtpmv<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xtrmv.cc b/test/performance/routines/level2/xtrmv.cc index 894a7952..3f23afd1 100644 --- a/test/performance/routines/level2/xtrmv.cc +++ b/test/performance/routines/level2/xtrmv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXtrmv<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXtrmv<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xgemm.cc b/test/performance/routines/level3/xgemm.cc index 91897ee1..8e48dc3a 100644 --- a/test/performance/routines/level3/xgemm.cc +++ b/test/performance/routines/level3/xgemm.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXgemm<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXgemm<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xsymm.cc b/test/performance/routines/level3/xsymm.cc index e0feadd1..7eac5537 100644 --- a/test/performance/routines/level3/xsymm.cc +++ b/test/performance/routines/level3/xsymm.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXsymm<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXsymm<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xsyr2k.cc b/test/performance/routines/level3/xsyr2k.cc index 4a82ddc4..49d00f34 100644 --- a/test/performance/routines/level3/xsyr2k.cc +++ b/test/performance/routines/level3/xsyr2k.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXsyr2k<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXsyr2k<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xsyrk.cc b/test/performance/routines/level3/xsyrk.cc index 70f61322..ad0a06b4 100644 --- a/test/performance/routines/level3/xsyrk.cc +++ b/test/performance/routines/level3/xsyrk.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXsyrk<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXsyrk<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xtrmm.cc b/test/performance/routines/level3/xtrmm.cc index 6f6041e4..92526844 100644 --- a/test/performance/routines/level3/xtrmm.cc +++ b/test/performance/routines/level3/xtrmm.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXtrmm<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXtrmm<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xtrsm.cc b/test/performance/routines/level3/xtrsm.cc index 76ef255a..08e4b4a9 100644 --- a/test/performance/routines/level3/xtrsm.cc +++ b/test/performance/routines/level3/xtrsm.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXtrsm<half>, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient<clblast::TestXtrsm<float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/routines/level1/xamax.h b/test/routines/level1/xamax.h index 7b404dc3..12b031bc 100644 --- a/test/routines/level1/xamax.h +++ b/test/routines/level1/xamax.h @@ -86,8 +86,8 @@ class TestXamax { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXamax<T>(args.n, - buffers.scalar(), args.imax_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.scalar, args.imax_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level1/xasum.h b/test/routines/level1/xasum.h index 6eae3c83..eb83817b 100644 --- a/test/routines/level1/xasum.h +++ b/test/routines/level1/xasum.h @@ -86,8 +86,8 @@ class TestXasum { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXasum<T>(args.n, - buffers.scalar(), args.asum_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.scalar, args.asum_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level1/xaxpy.h b/test/routines/level1/xaxpy.h index 8f72f570..c241da91 100644 --- a/test/routines/level1/xaxpy.h +++ b/test/routines/level1/xaxpy.h @@ -87,8 +87,8 @@ class TestXaxpy { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXaxpy(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level1/xcopy.h b/test/routines/level1/xcopy.h index 0527ca6a..a1ff06ce 100644 --- a/test/routines/level1/xcopy.h +++ b/test/routines/level1/xcopy.h @@ -86,8 +86,8 @@ class TestXcopy { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXcopy<T>(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level1/xdot.h b/test/routines/level1/xdot.h index d1c34c0f..0bbc93d5 100644 --- a/test/routines/level1/xdot.h +++ b/test/routines/level1/xdot.h @@ -91,9 +91,9 @@ class TestXdot { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdot<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level1/xdotc.h b/test/routines/level1/xdotc.h index a2742cb0..e1cc1854 100644 --- a/test/routines/level1/xdotc.h +++ b/test/routines/level1/xdotc.h @@ -91,9 +91,9 @@ class TestXdotc { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotc<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level1/xdotu.h b/test/routines/level1/xdotu.h index 06ce979e..558257cc 100644 --- a/test/routines/level1/xdotu.h +++ b/test/routines/level1/xdotu.h @@ -91,9 +91,9 @@ class TestXdotu { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotu<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level1/xnrm2.h b/test/routines/level1/xnrm2.h index d8a0de4e..19074ca2 100644 --- a/test/routines/level1/xnrm2.h +++ b/test/routines/level1/xnrm2.h @@ -86,8 +86,8 @@ class TestXnrm2 { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXnrm2<T>(args.n, - buffers.scalar(), args.nrm2_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.scalar, args.nrm2_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level1/xscal.h b/test/routines/level1/xscal.h index 35855dbd..84d14ac7 100644 --- a/test/routines/level1/xscal.h +++ b/test/routines/level1/xscal.h @@ -82,7 +82,7 @@ class TestXscal { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXscal(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level1/xswap.h b/test/routines/level1/xswap.h index ae69d3be..e870b602 100644 --- a/test/routines/level1/xswap.h +++ b/test/routines/level1/xswap.h @@ -86,8 +86,8 @@ class TestXswap { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXswap<T>(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h index c88cdf2a..c777ff73 100644 --- a/test/routines/level2/xgbmv.h +++ b/test/routines/level2/xgbmv.h @@ -102,9 +102,9 @@ class TestXgbmv { auto status = clblasXgbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), args.m, args.n, args.kl, args.ku, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.h index cf63d55f..f8a7e1d0 100644 --- a/test/routines/level2/xgemv.h +++ b/test/routines/level2/xgemv.h @@ -102,9 +102,9 @@ class TestXgemv { auto status = clblasXgemv(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xger.h b/test/routines/level2/xger.h index ae142e2e..e0d1fe49 100644 --- a/test/routines/level2/xger.h +++ b/test/routines/level2/xger.h @@ -97,9 +97,9 @@ class TestXger { auto event = cl_event{}; auto status = clblasXger(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xgerc.h b/test/routines/level2/xgerc.h index b236aef6..7449146b 100644 --- a/test/routines/level2/xgerc.h +++ b/test/routines/level2/xgerc.h @@ -97,9 +97,9 @@ class TestXgerc { auto event = cl_event{}; auto status = clblasXgerc(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xgeru.h b/test/routines/level2/xgeru.h index 3d3fa439..07837657 100644 --- a/test/routines/level2/xgeru.h +++ b/test/routines/level2/xgeru.h @@ -97,9 +97,9 @@ class TestXgeru { auto event = cl_event{}; auto status = clblasXgeru(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.h index 4098639a..73194975 100644 --- a/test/routines/level2/xhbmv.h +++ b/test/routines/level2/xhbmv.h @@ -96,9 +96,9 @@ class TestXhbmv { auto status = clblasXhbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xhemv.h b/test/routines/level2/xhemv.h index 5652872d..aabbf14a 100644 --- a/test/routines/level2/xhemv.h +++ b/test/routines/level2/xhemv.h @@ -96,9 +96,9 @@ class TestXhemv { auto status = clblasXhemv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xher.h b/test/routines/level2/xher.h index 3bbf0887..1294832c 100644 --- a/test/routines/level2/xher.h +++ b/test/routines/level2/xher.h @@ -91,8 +91,8 @@ class TestXher { auto status = clblasXher(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xher2.h b/test/routines/level2/xher2.h index dc7fbe73..5e90174d 100644 --- a/test/routines/level2/xher2.h +++ b/test/routines/level2/xher2.h @@ -96,9 +96,9 @@ class TestXher2 { auto status = clblasXher2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xhpmv.h b/test/routines/level2/xhpmv.h index df5a90ee..8face6b6 100644 --- a/test/routines/level2/xhpmv.h +++ b/test/routines/level2/xhpmv.h @@ -96,9 +96,9 @@ class TestXhpmv { auto status = clblasXhpmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xhpr.h b/test/routines/level2/xhpr.h index 0db11db0..63cab31f 100644 --- a/test/routines/level2/xhpr.h +++ b/test/routines/level2/xhpr.h @@ -91,8 +91,8 @@ class TestXhpr { auto status = clblasXhpr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xhpr2.h b/test/routines/level2/xhpr2.h index e1e5b4c5..64d205a0 100644 --- a/test/routines/level2/xhpr2.h +++ b/test/routines/level2/xhpr2.h @@ -96,9 +96,9 @@ class TestXhpr2 { auto status = clblasXhpr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.h index fce88f4c..3f1446c8 100644 --- a/test/routines/level2/xsbmv.h +++ b/test/routines/level2/xsbmv.h @@ -96,9 +96,9 @@ class TestXsbmv { auto status = clblasXsbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.h index 2fdba77a..2add3cdd 100644 --- a/test/routines/level2/xspmv.h +++ b/test/routines/level2/xspmv.h @@ -96,9 +96,9 @@ class TestXspmv { auto status = clblasXspmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xspr.h b/test/routines/level2/xspr.h index dcacc5de..ad21bdf6 100644 --- a/test/routines/level2/xspr.h +++ b/test/routines/level2/xspr.h @@ -91,8 +91,8 @@ class TestXspr { auto status = clblasXspr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xspr2.h b/test/routines/level2/xspr2.h index 69fda2fb..c55e8181 100644 --- a/test/routines/level2/xspr2.h +++ b/test/routines/level2/xspr2.h @@ -96,9 +96,9 @@ class TestXspr2 { auto status = clblasXspr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xsymv.h b/test/routines/level2/xsymv.h index 16f94d6f..b6583a24 100644 --- a/test/routines/level2/xsymv.h +++ b/test/routines/level2/xsymv.h @@ -96,9 +96,9 @@ class TestXsymv { auto status = clblasXsymv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xsyr.h b/test/routines/level2/xsyr.h index a66dd271..f3929588 100644 --- a/test/routines/level2/xsyr.h +++ b/test/routines/level2/xsyr.h @@ -91,8 +91,8 @@ class TestXsyr { auto status = clblasXsyr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xsyr2.h b/test/routines/level2/xsyr2.h index a36815e5..8cdb6a14 100644 --- a/test/routines/level2/xsyr2.h +++ b/test/routines/level2/xsyr2.h @@ -96,9 +96,9 @@ class TestXsyr2 { auto status = clblasXsyr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h index 1425b60b..9c4131ec 100644 --- a/test/routines/level2/xtbmv.h +++ b/test/routines/level2/xtbmv.h @@ -92,8 +92,8 @@ class TestXtbmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, args.kl, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h index a834b437..58249227 100644 --- a/test/routines/level2/xtpmv.h +++ b/test/routines/level2/xtpmv.h @@ -92,8 +92,8 @@ class TestXtpmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h index cd502d5d..635a1319 100644 --- a/test/routines/level2/xtrmv.h +++ b/test/routines/level2/xtrmv.h @@ -92,8 +92,8 @@ class TestXtrmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h index cd5c2acd..842dae93 100644 --- a/test/routines/level3/xgemm.h +++ b/test/routines/level3/xgemm.h @@ -105,9 +105,9 @@ class TestXgemm { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.b_transpose), args.m, args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.h index edc71024..106b99ff 100644 --- a/test/routines/level3/xhemm.h +++ b/test/routines/level3/xhemm.h @@ -105,9 +105,9 @@ class TestXhemm { convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.h index a78e1293..e2f4448f 100644 --- a/test/routines/level3/xher2k.h +++ b/test/routines/level3/xher2k.h @@ -105,9 +105,9 @@ class TestXher2k { convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, alpha2, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.h index 245293d6..43d7cfcd 100644 --- a/test/routines/level3/xherk.h +++ b/test/routines/level3/xherk.h @@ -95,8 +95,8 @@ class TestXherk { convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.h index e638b735..c32b4cf7 100644 --- a/test/routines/level3/xsymm.h +++ b/test/routines/level3/xsymm.h @@ -105,9 +105,9 @@ class TestXsymm { convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.h index abac20f4..57c3c203 100644 --- a/test/routines/level3/xsyr2k.h +++ b/test/routines/level3/xsyr2k.h @@ -103,9 +103,9 @@ class TestXsyr2k { convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.h index 8a5fcb5f..6c3a3786 100644 --- a/test/routines/level3/xsyrk.h +++ b/test/routines/level3/xsyrk.h @@ -95,8 +95,8 @@ class TestXsyrk { convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.h index 7c9c21bc..3eb63030 100644 --- a/test/routines/level3/xtrmm.h +++ b/test/routines/level3/xtrmm.h @@ -97,8 +97,8 @@ class TestXtrmm { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h index 529acfbf..bf59aa94 100644 --- a/test/wrapper_cblas.h +++ b/test/wrapper_cblas.h @@ -161,6 +161,17 @@ void cblasXswap(const size_t n, reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); } +void cblasXswap(const size_t n, + std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXswap(n, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(x_buffer, x_buffer_bis); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL void cblasXscal(const size_t n, @@ -193,6 +204,15 @@ void cblasXscal(const size_t n, alpha_array.data(), reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } +void cblasXscal(const size_t n, + const half alpha, + std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + cblasXscal(n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(x_buffer, x_buffer_bis); +} // Forwards the Netlib BLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY void cblasXcopy(const size_t n, @@ -223,6 +243,16 @@ void cblasXcopy(const size_t n, reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); } +void cblasXcopy(const size_t n, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXcopy(n, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY void cblasXaxpy(const size_t n, @@ -263,6 +293,18 @@ void cblasXaxpy(const size_t n, reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc), reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); } +void cblasXaxpy(const size_t n, + const half alpha, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXaxpy(n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for SDOT/DDOT void cblasXdot(const size_t n, @@ -281,6 +323,19 @@ void cblasXdot(const size_t n, &x_buffer[x_offset], static_cast<int>(x_inc), &y_buffer[y_offset], static_cast<int>(y_inc)); } +void cblasXdot(const size_t n, + std::vector<half>& dot_buffer, const size_t dot_offset, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + auto dot_buffer_bis = HalfToFloatBuffer(dot_buffer); + cblasXdot(n, + dot_buffer_bis, dot_offset, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(dot_buffer, dot_buffer_bis); +} // Forwards the Netlib BLAS calls for CDOTU/ZDOTU void cblasXdotu(const size_t n, @@ -347,6 +402,16 @@ void cblasXnrm2(const size_t n, nrm2_buffer[nrm2_offset].real(cblas_dznrm2(n, reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc))); } +void cblasXnrm2(const size_t n, + std::vector<half>& nrm2_buffer, const size_t nrm2_offset, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto nrm2_buffer_bis = HalfToFloatBuffer(nrm2_buffer); + cblasXnrm2(n, + nrm2_buffer_bis, nrm2_offset, + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(nrm2_buffer, nrm2_buffer_bis); +} // Forwards the Netlib BLAS calls for SASUM/DASUM/ScASUM/DzASUM void cblasXasum(const size_t n, @@ -373,8 +438,18 @@ void cblasXasum(const size_t n, asum_buffer[asum_offset].real(cblas_dzasum(n, reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc))); } +void cblasXasum(const size_t n, + std::vector<half>& asum_buffer, const size_t asum_offset, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto asum_buffer_bis = HalfToFloatBuffer(asum_buffer); + cblasXasum(n, + asum_buffer_bis, asum_offset, + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(asum_buffer, asum_buffer_bis); +} -// Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX void cblasXamax(const size_t n, std::vector<float>& imax_buffer, const size_t imax_offset, const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) { @@ -399,6 +474,16 @@ void cblasXamax(const size_t n, ((int*)&imax_buffer[0])[imax_offset] = cblas_izamax(n, reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } +void cblasXamax(const size_t n, + std::vector<half>& imax_buffer, const size_t imax_offset, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto imax_buffer_bis = HalfToFloatBuffer(imax_buffer); + cblasXamax(n, + imax_buffer_bis, imax_offset, + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(imax_buffer, imax_buffer_bis); +} // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -469,6 +554,25 @@ void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, beta_array.data(), reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); } +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const half alpha, + const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXgemv(layout, a_transpose, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, @@ -535,6 +639,25 @@ void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, beta_array.data(), reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc)); } +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const half alpha, + const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXgbmv(layout, a_transpose, + m, n, kl, ku, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for CHEMV/ZHEMV void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, @@ -675,6 +798,25 @@ void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, beta, &y_buffer[y_offset], static_cast<int>(y_inc)); } +void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const half alpha, + const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXsymv(layout, triangle, + n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for SSBMV/DSBMV void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, @@ -707,6 +849,25 @@ void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, beta, &y_buffer[y_offset], static_cast<int>(y_inc)); } +void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const half alpha, + const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXsbmv(layout, triangle, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for SSPMV/DSPMV void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, @@ -739,6 +900,25 @@ void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, beta, &y_buffer[y_offset], static_cast<int>(y_inc)); } +void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const half alpha, + const std::vector<half>& ap_buffer, const size_t ap_offset, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) { + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXspmv(layout, triangle, + n, + HalfToFloat(alpha), + ap_buffer_bis, ap_offset, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for STRMV/DTRMV/CTRMV/ZTRMV void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, @@ -777,6 +957,18 @@ void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + cblasXtrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(x_buffer, x_buffer_bis); +} // Forwards the Netlib BLAS calls for STBMV/DTBMV/CTBMV/ZTBMV void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, @@ -815,6 +1007,18 @@ void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + cblasXtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(x_buffer, x_buffer_bis); +} // Forwards the Netlib BLAS calls for STPMV/DTPMV/CTPMV/ZTPMV void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, @@ -853,6 +1057,18 @@ void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS reinterpret_cast<const double*>(&ap_buffer[ap_offset]), reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); } +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector<half>& ap_buffer, const size_t ap_offset, + std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) { + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + cblasXtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer_bis, ap_offset, + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(x_buffer, x_buffer_bis); +} // Forwards the Netlib BLAS calls for STRSV/DTRSV/CTRSV/ZTRSV void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, @@ -995,6 +1211,23 @@ void cblasXger(const CBLAS_ORDER layout, &y_buffer[y_offset], static_cast<int>(y_inc), &a_buffer[a_offset], a_ld); } +void cblasXger(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const half alpha, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + cblasXger(layout, + m, n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + a_buffer_bis, a_offset, a_ld); + FloatToHalfBuffer(a_buffer, a_buffer_bis); +} // Forwards the Netlib BLAS calls for CGERU/ZGERU void cblasXgeru(const CBLAS_ORDER layout, @@ -1187,6 +1420,20 @@ void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, &x_buffer[x_offset], static_cast<int>(x_inc), &a_buffer[a_offset], a_ld); } +void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const half alpha, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + cblasXsyr(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + a_buffer_bis, a_offset, a_ld); + FloatToHalfBuffer(a_buffer, a_buffer_bis); +} // Forwards the Netlib BLAS calls for SSPR/DSPR void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, @@ -1211,6 +1458,20 @@ void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, &x_buffer[x_offset], static_cast<int>(x_inc), &ap_buffer[ap_offset]); } +void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const half alpha, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector<half>& ap_buffer, const size_t ap_offset) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer); + cblasXspr(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + ap_buffer_bis, ap_offset); + FloatToHalfBuffer(ap_buffer, ap_buffer_bis); +} // Forwards the Netlib BLAS calls for SSYR2/DSYR2 void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, @@ -1239,6 +1500,23 @@ void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, &y_buffer[y_offset], static_cast<int>(y_inc), &a_buffer[a_offset], a_ld); } +void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const half alpha, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + cblasXsyr2(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + a_buffer_bis, a_offset, a_ld); + FloatToHalfBuffer(a_buffer, a_buffer_bis); +} // Forwards the Netlib BLAS calls for SSPR2/DSPR2 void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, @@ -1267,6 +1545,23 @@ void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, &y_buffer[y_offset], static_cast<int>(y_inc), &ap_buffer[ap_offset]); } +void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const half alpha, + const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector<half>& ap_buffer, const size_t ap_offset) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer); + cblasXspr2(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + ap_buffer_bis, ap_offset); + FloatToHalfBuffer(ap_buffer, ap_buffer_bis); +} // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -1337,6 +1632,25 @@ void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, con beta_array.data(), reinterpret_cast<double*>(&c_buffer[c_offset]), c_ld); } +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const half alpha, + const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + std::vector<half>& c_buffer, const size_t c_offset, const size_t c_ld) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer); + cblasXgemm(layout, a_transpose, b_transpose, + m, n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld); + FloatToHalfBuffer(c_buffer, c_buffer_bis); +} // Forwards the Netlib BLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, @@ -1403,6 +1717,25 @@ void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL beta_array.data(), reinterpret_cast<double*>(&c_buffer[c_offset]), c_ld); } +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const half alpha, + const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + std::vector<half>& c_buffer, const size_t c_offset, const size_t c_ld) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer); + cblasXsymm(layout, side, triangle, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld); + FloatToHalfBuffer(c_buffer, c_buffer_bis); +} // Forwards the Netlib BLAS calls for CHEMM/ZHEMM void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, @@ -1497,6 +1830,22 @@ void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS beta_array.data(), reinterpret_cast<double*>(&c_buffer[c_offset]), c_ld); } +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const half alpha, + const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const half beta, + std::vector<half>& c_buffer, const size_t c_offset, const size_t c_ld) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer); + cblasXsyrk(layout, triangle, a_transpose, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld); + FloatToHalfBuffer(c_buffer, c_buffer_bis); +} // Forwards the Netlib BLAS calls for CHERK/ZHERK void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, @@ -1591,6 +1940,25 @@ void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA beta_array.data(), reinterpret_cast<double*>(&c_buffer[c_offset]), c_ld); } +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const half alpha, + const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + std::vector<half>& c_buffer, const size_t c_offset, const size_t c_ld) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer); + cblasXsyr2k(layout, triangle, ab_transpose, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld); + FloatToHalfBuffer(c_buffer, c_buffer_bis); +} // Forwards the Netlib BLAS calls for CHER2K/ZHER2K void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, @@ -1673,6 +2041,20 @@ void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<double*>(&b_buffer[b_offset]), b_ld); } +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const half alpha, + const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer); + cblasXtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld); + FloatToHalfBuffer(b_buffer, b_buffer_bis); +} // Forwards the Netlib BLAS calls for STRSM/DTRSM/CTRSM/ZTRSM void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, @@ -1721,6 +2103,20 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld, reinterpret_cast<double*>(&b_buffer[b_offset]), b_ld); } +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const half alpha, + const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer); + cblasXtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld); + FloatToHalfBuffer(b_buffer, b_buffer_bis); +} // ================================================================================================= } // namespace clblast diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 23c55373..5115b3d9 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -34,104 +34,104 @@ clblasSide convertToCLBLAS(const Side v) { return (v == Side::kLeft) ? clblasLef // Forwards the clBLAS calls for SROTG/DROTG template <typename T> -clblasStatus clblasXrotg(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, +clblasStatus clblasXrotg(Buffer<T>& sa_buffer, const size_t sa_offset, + Buffer<T>& sb_buffer, const size_t sb_offset, + Buffer<T>& sc_buffer, const size_t sc_offset, + Buffer<T>& ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> -clblasStatus clblasXrotg<float>(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, +clblasStatus clblasXrotg<float>(Buffer<float>& sa_buffer, const size_t sa_offset, + Buffer<float>& sb_buffer, const size_t sb_offset, + Buffer<float>& sc_buffer, const size_t sc_offset, + Buffer<float>& ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSrotg(sa_buffer, sa_offset, - sb_buffer, sb_offset, - sc_buffer, sc_offset, - ss_buffer, ss_offset, + return clblasSrotg(sa_buffer(), sa_offset, + sb_buffer(), sb_offset, + sc_buffer(), sc_offset, + ss_buffer(), ss_offset, num_queues, queues, num_wait_events, wait_events, events); } template <> -clblasStatus clblasXrotg<double>(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, +clblasStatus clblasXrotg<double>(Buffer<double>& sa_buffer, const size_t sa_offset, + Buffer<double>& sb_buffer, const size_t sb_offset, + Buffer<double>& sc_buffer, const size_t sc_offset, + Buffer<double>& ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDrotg(sa_buffer, sa_offset, - sb_buffer, sb_offset, - sc_buffer, sc_offset, - ss_buffer, ss_offset, + return clblasDrotg(sa_buffer(), sa_offset, + sb_buffer(), sb_offset, + sc_buffer(), sc_offset, + ss_buffer(), ss_offset, num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SROTMG/DROTMG template <typename T> -clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, +clblasStatus clblasXrotmg(Buffer<T>& sd1_buffer, const size_t sd1_offset, + Buffer<T>& sd2_buffer, const size_t sd2_offset, + Buffer<T>& sx1_buffer, const size_t sx1_offset, + const Buffer<T>& sy1_buffer, const size_t sy1_offset, + Buffer<T>& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> -clblasStatus clblasXrotmg<float>(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, +clblasStatus clblasXrotmg<float>(Buffer<float>& sd1_buffer, const size_t sd1_offset, + Buffer<float>& sd2_buffer, const size_t sd2_offset, + Buffer<float>& sx1_buffer, const size_t sx1_offset, + const Buffer<float>& sy1_buffer, const size_t sy1_offset, + Buffer<float>& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSrotmg(sd1_buffer, sd1_offset, - sd2_buffer, sd2_offset, - sx1_buffer, sx1_offset, - sy1_buffer, sy1_offset, - sparam_buffer, sparam_offset, + return clblasSrotmg(sd1_buffer(), sd1_offset, + sd2_buffer(), sd2_offset, + sx1_buffer(), sx1_offset, + sy1_buffer(), sy1_offset, + sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } template <> -clblasStatus clblasXrotmg<double>(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, +clblasStatus clblasXrotmg<double>(Buffer<double>& sd1_buffer, const size_t sd1_offset, + Buffer<double>& sd2_buffer, const size_t sd2_offset, + Buffer<double>& sx1_buffer, const size_t sx1_offset, + const Buffer<double>& sy1_buffer, const size_t sy1_offset, + Buffer<double>& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDrotmg(sd1_buffer, sd1_offset, - sd2_buffer, sd2_offset, - sx1_buffer, sx1_offset, - sy1_buffer, sy1_offset, - sparam_buffer, sparam_offset, + return clblasDrotmg(sd1_buffer(), sd1_offset, + sd2_buffer(), sd2_offset, + sx1_buffer(), sx1_offset, + sy1_buffer(), sy1_offset, + sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SROT/DROT clblasStatus clblasXrot(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, const float cos, const float sin, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSrot(n, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), cos, sin, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXrot(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, const double cos, const double sin, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDrot(n, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), cos, sin, num_queues, queues, num_wait_events, wait_events, events); @@ -140,316 +140,394 @@ clblasStatus clblasXrot(const size_t n, // Forwards the clBLAS calls for SROTM/DROTM template <typename T> clblasStatus clblasXrotm(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, + Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<T>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<T>& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXrotm<float>(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, + Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float>& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSrotm(n, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - sparam_buffer, sparam_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXrotm<double>(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, + Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double>& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDrotm(n, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - sparam_buffer, sparam_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP template <typename T> clblasStatus clblasXswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<T>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXswap<float>(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSswap(n, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXswap<double>(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDswap(n, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXswap<float2>(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCswap(n, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXswap<double2>(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZswap(n, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXswap<half>(const size_t n, + Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXswap(n, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL clblasStatus clblasXscal(const size_t n, const float alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSscal(n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const double alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDscal(n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const float2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCscal(n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast<int>(x_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const double2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZscal(n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast<int>(x_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXscal(const size_t n, + const half alpha, + Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto status = clblasXscal(n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY template <typename T> clblasStatus clblasXcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<T>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXcopy<float>(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasScopy(n, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXcopy<double>(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDcopy(n, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXcopy<float2>(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCcopy(n, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXcopy<double2>(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZcopy(n, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXcopy<half>(const size_t n, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXcopy(n, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY clblasStatus clblasXaxpy(const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSaxpy(n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDaxpy(n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCaxpy(n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZaxpy(n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXaxpy(const size_t n, + const half alpha, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXaxpy(n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SDOT/DDOT template <typename T> clblasStatus clblasXdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<T>& dot_buffer, const size_t dot_offset, + const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXdot<float>(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float>& dot_buffer, const size_t dot_offset, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<float>(context, n); return clblasSdot(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdot<double>(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double>& dot_buffer, const size_t dot_offset, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<double>(context, n); return clblasDdot(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXdot<half>(const size_t n, + Buffer<half>& dot_buffer, const size_t dot_offset, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto dot_buffer_bis = HalfToFloatBuffer(dot_buffer, queues[0]); + auto status = clblasXdot(n, + dot_buffer_bis, dot_offset, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(dot_buffer, dot_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for CDOTU/ZDOTU template <typename T> clblasStatus clblasXdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<T>& dot_buffer, const size_t dot_offset, + const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXdotu<float2>(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float2>& dot_buffer, const size_t dot_offset, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<float2>(context, n); return clblasCdotu(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdotu<double2>(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double2>& dot_buffer, const size_t dot_offset, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<double2>(context, n); return clblasZdotu(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } @@ -457,42 +535,42 @@ clblasStatus clblasXdotu<double2>(const size_t n, // Forwards the clBLAS calls for CDOTC/ZDOTC template <typename T> clblasStatus clblasXdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<T>& dot_buffer, const size_t dot_offset, + const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXdotc<float2>(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float2>& dot_buffer, const size_t dot_offset, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<float2>(context, n); return clblasCdotc(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdotc<double2>(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double2>& dot_buffer, const size_t dot_offset, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<double2>(context, n); return clblasZdotc(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } @@ -500,206 +578,251 @@ clblasStatus clblasXdotc<double2>(const size_t n, // Forwards the clBLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2 template <typename T> clblasStatus clblasXnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<T>& nrm2_buffer, const size_t nrm2_offset, + const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXnrm2<float>(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float>& nrm2_buffer, const size_t nrm2_offset, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<float>(context, 2*n); return clblasSnrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + nrm2_buffer(), nrm2_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2<double>(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double>& nrm2_buffer, const size_t nrm2_offset, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<double>(context, 2*n); return clblasDnrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + nrm2_buffer(), nrm2_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2<float2>(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float2>& nrm2_buffer, const size_t nrm2_offset, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<float2>(context, 2*n); return clblasScnrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + nrm2_buffer(), nrm2_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2<double2>(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double2>& nrm2_buffer, const size_t nrm2_offset, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<double2>(context, 2*n); return clblasDznrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + nrm2_buffer(), nrm2_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXnrm2<half>(const size_t n, + Buffer<half>& nrm2_buffer, const size_t nrm2_offset, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto nrm2_buffer_bis = HalfToFloatBuffer(nrm2_buffer, queues[0]); + auto status = clblasXnrm2(n, + nrm2_buffer_bis, nrm2_offset, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(nrm2_buffer, nrm2_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SASUM/DASUM/ScASUM/DzASUM template <typename T> clblasStatus clblasXasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<T>& asum_buffer, const size_t asum_offset, + const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXasum<float>(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float>& asum_buffer, const size_t asum_offset, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<float>(context, n); return clblasSasum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + asum_buffer(), asum_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum<double>(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double>& asum_buffer, const size_t asum_offset, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<double>(context, n); return clblasDasum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + asum_buffer(), asum_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum<float2>(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float2>& asum_buffer, const size_t asum_offset, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<float2>(context, n); return clblasScasum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + asum_buffer(), asum_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum<double2>(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double2>& asum_buffer, const size_t asum_offset, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<double2>(context, n); return clblasDzasum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + asum_buffer(), asum_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXasum<half>(const size_t n, + Buffer<half>& asum_buffer, const size_t asum_offset, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto asum_buffer_bis = HalfToFloatBuffer(asum_buffer, queues[0]); + auto status = clblasXasum(n, + asum_buffer_bis, asum_offset, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(asum_buffer, asum_buffer_bis, queues[0]); + return status; +} -// Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template <typename T> clblasStatus clblasXamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<T>& imax_buffer, const size_t imax_offset, + const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXamax<float>(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float>& imax_buffer, const size_t imax_offset, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<float>(context, 2*n); return clblasiSamax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + imax_buffer(), imax_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXamax<double>(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double>& imax_buffer, const size_t imax_offset, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<double>(context, 2*n); return clblasiDamax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + imax_buffer(), imax_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXamax<float2>(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float2>& imax_buffer, const size_t imax_offset, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<float2>(context, 2*n); return clblasiCamax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + imax_buffer(), imax_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXamax<double2>(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double2>& imax_buffer, const size_t imax_offset, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer<double2>(context, 2*n); return clblasiZamax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + imax_buffer(), imax_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXamax<half>(const size_t n, + Buffer<half>& imax_buffer, const size_t imax_offset, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto imax_buffer_bis = HalfToFloatBuffer(imax_buffer, queues[0]); + auto status = clblasXamax(n, + imax_buffer_bis, imax_offset, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(imax_buffer, imax_buffer_bis, queues[0]); + return status; +} // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -709,185 +832,231 @@ clblasStatus clblasXamax<double2>(const size_t n, clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSgemv(layout, a_transpose, m, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), beta, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDgemv(layout, a_transpose, m, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), beta, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgemv(layout, a_transpose, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgemv(layout, a_transpose, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, + const half alpha, + const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXgemv(layout, a_transpose, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSgbmv(layout, a_transpose, m, n, kl, ku, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), beta, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDgbmv(layout, a_transpose, m, n, kl, ku, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), beta, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgbmv(layout, a_transpose, m, n, kl, ku, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgbmv(layout, a_transpose, m, n, kl, ku, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const half alpha, + const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXgbmv(layout, a_transpose, + m, n, kl, ku, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for CHEMV/ZHEMV clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChemv(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhemv(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -895,37 +1064,37 @@ clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChbmv(layout, triangle, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhbmv(layout, triangle, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -933,37 +1102,37 @@ clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& ap_buffer, const size_t ap_offset, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChpmv(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& ap_buffer, const size_t ap_offset, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhpmv(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -971,129 +1140,198 @@ clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsymv(layout, triangle, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), beta, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsymv(layout, triangle, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), beta, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const half alpha, + const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXsymv(layout, triangle, + n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSBMV/DSBMV clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsbmv(layout, triangle, n, k, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), beta, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsbmv(layout, triangle, n, k, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), beta, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const half alpha, + const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXsbmv(layout, triangle, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSPMV/DSPMV clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& ap_buffer, const size_t ap_offset, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSspmv(layout, triangle, n, alpha, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), beta, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& ap_buffer, const size_t ap_offset, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDspmv(layout, triangle, n, alpha, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), beta, - y_buffer, y_offset, static_cast<int>(y_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const half alpha, + const Buffer<half>& ap_buffer, const size_t ap_offset, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXspmv(layout, triangle, + n, + HalfToFloat(alpha), + ap_buffer_bis, ap_offset, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for STRMV/DTRMV/CTRMV/ZTRMV template <typename T> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtrmv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1101,16 +1339,16 @@ clblasStatus clblasXtrmv<float>(const clblasOrder layout, const clblasUplo trian auto scratch_buffer = Buffer<float>(context, n); return clblasStrmv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrmv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1118,16 +1356,16 @@ clblasStatus clblasXtrmv<double>(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer<double>(context, n); return clblasDtrmv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrmv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1135,16 +1373,16 @@ clblasStatus clblasXtrmv<float2>(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer<float2>(context, n); return clblasCtrmv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrmv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1152,25 +1390,42 @@ clblasStatus clblasXtrmv<double2>(const clblasOrder layout, const clblasUplo tri auto scratch_buffer = Buffer<double2>(context, n); return clblasZtrmv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXtrmv<half>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto status = clblasXtrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for STBMV/DTBMV/CTBMV/ZTBMV template <typename T> clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtbmv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1178,16 +1433,16 @@ clblasStatus clblasXtbmv<float>(const clblasOrder layout, const clblasUplo trian auto scratch_buffer = Buffer<float>(context, n); return clblasStbmv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbmv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1195,16 +1450,16 @@ clblasStatus clblasXtbmv<double>(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer<double>(context, n); return clblasDtbmv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbmv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1212,16 +1467,16 @@ clblasStatus clblasXtbmv<float2>(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer<float2>(context, n); return clblasCtbmv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbmv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1229,25 +1484,42 @@ clblasStatus clblasXtbmv<double2>(const clblasOrder layout, const clblasUplo tri auto scratch_buffer = Buffer<double2>(context, n); return clblasZtbmv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXtbmv<half>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto status = clblasXtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for STPMV/DTPMV/CTPMV/ZTPMV template <typename T> clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T>& ap_buffer, const size_t ap_offset, + Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtpmv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& ap_buffer, const size_t ap_offset, + Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1255,16 +1527,16 @@ clblasStatus clblasXtpmv<float>(const clblasOrder layout, const clblasUplo trian auto scratch_buffer = Buffer<float>(context, n); return clblasStpmv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpmv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& ap_buffer, const size_t ap_offset, + Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1272,16 +1544,16 @@ clblasStatus clblasXtpmv<double>(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer<double>(context, n); return clblasDtpmv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpmv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& ap_buffer, const size_t ap_offset, + Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1289,16 +1561,16 @@ clblasStatus clblasXtpmv<float2>(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer<float2>(context, n); return clblasCtpmv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpmv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& ap_buffer, const size_t ap_offset, + Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1306,70 +1578,87 @@ clblasStatus clblasXtpmv<double2>(const clblasOrder layout, const clblasUplo tri auto scratch_buffer = Buffer<double2>(context, n); return clblasZtpmv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXtpmv<half>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const Buffer<half>& ap_buffer, const size_t ap_offset, + Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto status = clblasXtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer_bis, ap_offset, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for STRSV/DTRSV/CTRSV/ZTRSV template <typename T> clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtrsv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStrsv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrsv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtrsv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrsv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtrsv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrsv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtrsv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -1377,60 +1666,60 @@ clblasStatus clblasXtrsv<double2>(const clblasOrder layout, const clblasUplo tri template <typename T> clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtbsv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStbsv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbsv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtbsv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbsv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtbsv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbsv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtbsv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -1438,60 +1727,60 @@ clblasStatus clblasXtbsv<double2>(const clblasOrder layout, const clblasUplo tri template <typename T> clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T>& ap_buffer, const size_t ap_offset, + Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtpsv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& ap_buffer, const size_t ap_offset, + Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStpsv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpsv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& ap_buffer, const size_t ap_offset, + Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtpsv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpsv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& ap_buffer, const size_t ap_offset, + Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtpsv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpsv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& ap_buffer, const size_t ap_offset, + Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtpsv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -1499,67 +1788,88 @@ clblasStatus clblasXtpsv<double2>(const clblasOrder layout, const clblasUplo tri clblasStatus clblasXger(const clblasOrder layout, const size_t m, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSger(layout, m, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXger(const clblasOrder layout, const size_t m, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDger(layout, m, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXger(const clblasOrder layout, + const size_t m, const size_t n, + const half alpha, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto status = clblasXger(layout, + m, n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + a_buffer_bis, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for CGERU/ZGERU clblasStatus clblasXgeru(const clblasOrder layout, const size_t m, const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgeru(layout, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgeru(const clblasOrder layout, const size_t m, const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgeru(layout, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -1567,33 +1877,33 @@ clblasStatus clblasXgeru(const clblasOrder layout, clblasStatus clblasXgerc(const clblasOrder layout, const size_t m, const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgerc(layout, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgerc(const clblasOrder layout, const size_t m, const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgerc(layout, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -1601,29 +1911,29 @@ clblasStatus clblasXgerc(const clblasOrder layout, clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCher(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZher(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -1631,29 +1941,29 @@ clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float2>& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChpr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double2>& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhpr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } @@ -1661,33 +1971,33 @@ clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCher2(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZher2(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -1695,33 +2005,33 @@ clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float2>& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChpr2(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double2>& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhpr2(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } @@ -1729,129 +2039,207 @@ clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsyr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsyr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const half alpha, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto status = clblasXsyr(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + a_buffer_bis, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSPR/DSPR clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<float>& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSspr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<double>& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDspr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const half alpha, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer<half>& ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]); + auto status = clblasXspr(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + ap_buffer_bis, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(ap_buffer, ap_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSYR2/DSYR2 clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsyr2(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsyr2(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const half alpha, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto status = clblasXsyr2(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + a_buffer_bis, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSPR2/DSPR2 clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<float>& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSspr2(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<double>& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDspr2(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast<int>(x_inc), - y_buffer, y_offset, static_cast<int>(y_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast<int>(x_inc), + y_buffer(), y_offset, static_cast<int>(y_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const half alpha, + const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer<half>& ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]); + auto status = clblasXspr2(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + ap_buffer_bis, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(ap_buffer, ap_buffer_bis, queues[0]); + return status; +} // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -1861,185 +2249,231 @@ clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<float>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSgemm(layout, a_transpose, b_transpose, m, n, k, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<double>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDgemm(layout, a_transpose, b_transpose, m, n, k, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgemm(layout, a_transpose, b_transpose, m, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgemm(layout, a_transpose, b_transpose, m, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const half alpha, + const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + Buffer<half>& c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]); + auto status = clblasXgemm(layout, a_transpose, b_transpose, + m, n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<float>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsymm(layout, side, triangle, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<double>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsymm(layout, side, triangle, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCsymm(layout, side, triangle, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZsymm(layout, side, triangle, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const half alpha, + const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + Buffer<half>& c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]); + auto status = clblasXsymm(layout, side, triangle, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for CHEMM/ZHEMM clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChemm(layout, side, triangle, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhemm(layout, side, triangle, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -2047,99 +2481,119 @@ clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<float>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsyrk(layout, triangle, a_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<double>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsyrk(layout, triangle, a_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCsyrk(layout, triangle, a_transpose, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZsyrk(layout, triangle, a_transpose, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const half alpha, + const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const half beta, + Buffer<half>& c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]); + auto status = clblasXsyrk(layout, triangle, a_transpose, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for CHERK/ZHERK clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCherk(layout, triangle, a_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZherk(layout, triangle, a_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -2147,111 +2601,134 @@ clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, co clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<float>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsyr2k(layout, triangle, ab_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<double>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsyr2k(layout, triangle, ab_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCsyr2k(layout, triangle, ab_transpose, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZsyr2k(layout, triangle, ab_transpose, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const half alpha, + const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + Buffer<half>& c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]); + auto status = clblasXsyr2k(layout, triangle, ab_transpose, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for CHER2K/ZHER2K clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCher2k(layout, triangle, ab_transpose, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZher2k(layout, triangle, ab_transpose, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -2259,117 +2736,153 @@ clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, c clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtrmm(layout, side, triangle, a_transpose, diagonal, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtrmm(layout, side, triangle, a_transpose, diagonal, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const half alpha, + const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto status = clblasXtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(b_buffer, b_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for STRSM/DTRSM/CTRSM/ZTRSM clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtrsm(layout, side, triangle, a_transpose, diagonal, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtrsm(layout, side, triangle, a_transpose, diagonal, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const half alpha, + const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto status = clblasXtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(b_buffer, b_buffer_bis, queues[0]); + return status; +} // ================================================================================================= } // namespace clblast |