From 61203453aaca4e47c05c598a673150522160ca87 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 19 Jun 2016 13:55:49 +0200 Subject: Renamed all C++ source files to .cpp to match the .hpp extension better --- CHANGELOG | 1 + CMakeLists.txt | 46 +- README.md | 2 +- samples/sgemm.cc | 107 - samples/sgemm.cpp | 107 + scripts/generator/generator.py | 20 +- src/cache.cc | 111 - src/cache.cpp | 111 + src/clblast.cc | 2193 ------------------ src/clblast.cpp | 2193 ++++++++++++++++++ src/clblast_c.cc | 2927 ------------------------ src/clblast_c.cpp | 2927 ++++++++++++++++++++++++ src/database/database.cc | 120 - src/database/database.cpp | 120 + src/routine.cc | 131 -- src/routine.cpp | 131 ++ src/routines/common.cc | 65 - src/routines/common.cpp | 65 + src/routines/level1/xamax.cc | 105 - src/routines/level1/xamax.cpp | 105 + src/routines/level1/xasum.cc | 102 - src/routines/level1/xasum.cpp | 102 + src/routines/level1/xaxpy.cc | 113 - src/routines/level1/xaxpy.cpp | 113 + src/routines/level1/xcopy.cc | 107 - src/routines/level1/xcopy.cpp | 107 + src/routines/level1/xdot.cc | 110 - src/routines/level1/xdot.cpp | 110 + src/routines/level1/xdotc.cc | 49 - src/routines/level1/xdotc.cpp | 49 + src/routines/level1/xdotu.cc | 48 - src/routines/level1/xdotu.cpp | 48 + src/routines/level1/xnrm2.cc | 102 - src/routines/level1/xnrm2.cpp | 102 + src/routines/level1/xscal.cc | 101 - src/routines/level1/xscal.cpp | 101 + src/routines/level1/xswap.cc | 107 - src/routines/level1/xswap.cpp | 107 + src/routines/level2/xgbmv.cc | 68 - src/routines/level2/xgbmv.cpp | 68 + src/routines/level2/xgemv.cc | 181 -- src/routines/level2/xgemv.cpp | 181 ++ src/routines/level2/xger.cc | 106 - src/routines/level2/xger.cpp | 106 + src/routines/level2/xgerc.cc | 53 - src/routines/level2/xgerc.cpp | 53 + src/routines/level2/xgeru.cc | 52 - src/routines/level2/xgeru.cpp | 52 + src/routines/level2/xhbmv.cc | 64 - src/routines/level2/xhbmv.cpp | 64 + src/routines/level2/xhemv.cc | 64 - 
src/routines/level2/xhemv.cpp | 64 + src/routines/level2/xher.cc | 117 - src/routines/level2/xher.cpp | 117 + src/routines/level2/xher2.cc | 108 - src/routines/level2/xher2.cpp | 108 + src/routines/level2/xhpmv.cc | 64 - src/routines/level2/xhpmv.cpp | 64 + src/routines/level2/xhpr.cc | 51 - src/routines/level2/xhpr.cpp | 51 + src/routines/level2/xhpr2.cc | 53 - src/routines/level2/xhpr2.cpp | 53 + src/routines/level2/xsbmv.cc | 65 - src/routines/level2/xsbmv.cpp | 65 + src/routines/level2/xspmv.cc | 65 - src/routines/level2/xspmv.cpp | 65 + src/routines/level2/xspr.cc | 52 - src/routines/level2/xspr.cpp | 52 + src/routines/level2/xspr2.cc | 54 - src/routines/level2/xspr2.cpp | 54 + src/routines/level2/xsymv.cc | 65 - src/routines/level2/xsymv.cpp | 65 + src/routines/level2/xsyr.cc | 51 - src/routines/level2/xsyr.cpp | 51 + src/routines/level2/xsyr2.cc | 53 - src/routines/level2/xsyr2.cpp | 53 + src/routines/level2/xtbmv.cc | 82 - src/routines/level2/xtbmv.cpp | 82 + src/routines/level2/xtpmv.cc | 82 - src/routines/level2/xtpmv.cpp | 82 + src/routines/level2/xtrmv.cc | 82 - src/routines/level2/xtrmv.cpp | 82 + src/routines/level3/xgemm.cc | 223 -- src/routines/level3/xgemm.cpp | 223 ++ src/routines/level3/xhemm.cc | 134 -- src/routines/level3/xhemm.cpp | 134 ++ src/routines/level3/xher2k.cc | 241 -- src/routines/level3/xher2k.cpp | 241 ++ src/routines/level3/xherk.cc | 197 -- src/routines/level3/xherk.cpp | 197 ++ src/routines/level3/xsymm.cc | 137 -- src/routines/level3/xsymm.cpp | 137 ++ src/routines/level3/xsyr2k.cc | 210 -- src/routines/level3/xsyr2k.cpp | 210 ++ src/routines/level3/xsyrk.cc | 181 -- src/routines/level3/xsyrk.cpp | 181 ++ src/routines/level3/xtrmm.cc | 140 -- src/routines/level3/xtrmm.cpp | 140 ++ src/routines/levelx/xomatcopy.cc | 94 - src/routines/levelx/xomatcopy.cpp | 94 + src/tuning/kernels/copy_fast.cc | 122 - src/tuning/kernels/copy_fast.cpp | 122 + src/tuning/kernels/copy_pad.cc | 130 -- src/tuning/kernels/copy_pad.cpp | 130 ++ 
src/tuning/kernels/transpose_fast.cc | 127 - src/tuning/kernels/transpose_fast.cpp | 127 + src/tuning/kernels/transpose_pad.cc | 134 -- src/tuning/kernels/transpose_pad.cpp | 134 ++ src/tuning/kernels/xaxpy.cc | 125 - src/tuning/kernels/xaxpy.cpp | 125 + src/tuning/kernels/xdot.cc | 137 -- src/tuning/kernels/xdot.cpp | 137 ++ src/tuning/kernels/xgemm.cc | 162 -- src/tuning/kernels/xgemm.cpp | 162 ++ src/tuning/kernels/xgemv.cc | 156 -- src/tuning/kernels/xgemv.cpp | 156 ++ src/tuning/kernels/xger.cc | 130 -- src/tuning/kernels/xger.cpp | 130 ++ src/utilities.cc | 390 ---- src/utilities.cpp | 390 ++++ test/correctness/routines/level1/xamax.cc | 30 - test/correctness/routines/level1/xamax.cpp | 30 + test/correctness/routines/level1/xasum.cc | 30 - test/correctness/routines/level1/xasum.cpp | 30 + test/correctness/routines/level1/xaxpy.cc | 30 - test/correctness/routines/level1/xaxpy.cpp | 30 + test/correctness/routines/level1/xcopy.cc | 30 - test/correctness/routines/level1/xcopy.cpp | 30 + test/correctness/routines/level1/xdot.cc | 28 - test/correctness/routines/level1/xdot.cpp | 28 + test/correctness/routines/level1/xdotc.cc | 27 - test/correctness/routines/level1/xdotc.cpp | 27 + test/correctness/routines/level1/xdotu.cc | 27 - test/correctness/routines/level1/xdotu.cpp | 27 + test/correctness/routines/level1/xnrm2.cc | 30 - test/correctness/routines/level1/xnrm2.cpp | 30 + test/correctness/routines/level1/xrot.cc | 27 - test/correctness/routines/level1/xrot.cpp | 27 + test/correctness/routines/level1/xrotg.cc | 27 - test/correctness/routines/level1/xrotg.cpp | 27 + test/correctness/routines/level1/xrotm.cc | 27 - test/correctness/routines/level1/xrotm.cpp | 27 + test/correctness/routines/level1/xrotmg.cc | 27 - test/correctness/routines/level1/xrotmg.cpp | 27 + test/correctness/routines/level1/xscal.cc | 30 - test/correctness/routines/level1/xscal.cpp | 30 + test/correctness/routines/level1/xswap.cc | 30 - test/correctness/routines/level1/xswap.cpp | 30 + 
test/correctness/routines/level2/xgbmv.cc | 30 - test/correctness/routines/level2/xgbmv.cpp | 30 + test/correctness/routines/level2/xgemv.cc | 30 - test/correctness/routines/level2/xgemv.cpp | 30 + test/correctness/routines/level2/xger.cc | 28 - test/correctness/routines/level2/xger.cpp | 28 + test/correctness/routines/level2/xgerc.cc | 27 - test/correctness/routines/level2/xgerc.cpp | 27 + test/correctness/routines/level2/xgeru.cc | 27 - test/correctness/routines/level2/xgeru.cpp | 27 + test/correctness/routines/level2/xhbmv.cc | 27 - test/correctness/routines/level2/xhbmv.cpp | 27 + test/correctness/routines/level2/xhemv.cc | 27 - test/correctness/routines/level2/xhemv.cpp | 27 + test/correctness/routines/level2/xher.cc | 27 - test/correctness/routines/level2/xher.cpp | 27 + test/correctness/routines/level2/xher2.cc | 27 - test/correctness/routines/level2/xher2.cpp | 27 + test/correctness/routines/level2/xhpmv.cc | 27 - test/correctness/routines/level2/xhpmv.cpp | 27 + test/correctness/routines/level2/xhpr.cc | 27 - test/correctness/routines/level2/xhpr.cpp | 27 + test/correctness/routines/level2/xhpr2.cc | 27 - test/correctness/routines/level2/xhpr2.cpp | 27 + test/correctness/routines/level2/xsbmv.cc | 28 - test/correctness/routines/level2/xsbmv.cpp | 28 + test/correctness/routines/level2/xspmv.cc | 28 - test/correctness/routines/level2/xspmv.cpp | 28 + test/correctness/routines/level2/xspr.cc | 28 - test/correctness/routines/level2/xspr.cpp | 28 + test/correctness/routines/level2/xspr2.cc | 28 - test/correctness/routines/level2/xspr2.cpp | 28 + test/correctness/routines/level2/xsymv.cc | 28 - test/correctness/routines/level2/xsymv.cpp | 28 + test/correctness/routines/level2/xsyr.cc | 28 - test/correctness/routines/level2/xsyr.cpp | 28 + test/correctness/routines/level2/xsyr2.cc | 28 - test/correctness/routines/level2/xsyr2.cpp | 28 + test/correctness/routines/level2/xtbmv.cc | 30 - test/correctness/routines/level2/xtbmv.cpp | 30 + 
test/correctness/routines/level2/xtbsv.cc | 29 - test/correctness/routines/level2/xtbsv.cpp | 29 + test/correctness/routines/level2/xtpmv.cc | 30 - test/correctness/routines/level2/xtpmv.cpp | 30 + test/correctness/routines/level2/xtpsv.cc | 29 - test/correctness/routines/level2/xtpsv.cpp | 29 + test/correctness/routines/level2/xtrmv.cc | 30 - test/correctness/routines/level2/xtrmv.cpp | 30 + test/correctness/routines/level2/xtrsv.cc | 29 - test/correctness/routines/level2/xtrsv.cpp | 29 + test/correctness/routines/level3/xgemm.cc | 30 - test/correctness/routines/level3/xgemm.cpp | 30 + test/correctness/routines/level3/xhemm.cc | 27 - test/correctness/routines/level3/xhemm.cpp | 27 + test/correctness/routines/level3/xher2k.cc | 27 - test/correctness/routines/level3/xher2k.cpp | 27 + test/correctness/routines/level3/xherk.cc | 27 - test/correctness/routines/level3/xherk.cpp | 27 + test/correctness/routines/level3/xsymm.cc | 30 - test/correctness/routines/level3/xsymm.cpp | 30 + test/correctness/routines/level3/xsyr2k.cc | 30 - test/correctness/routines/level3/xsyr2k.cpp | 30 + test/correctness/routines/level3/xsyrk.cc | 30 - test/correctness/routines/level3/xsyrk.cpp | 30 + test/correctness/routines/level3/xtrmm.cc | 30 - test/correctness/routines/level3/xtrmm.cpp | 30 + test/correctness/routines/level3/xtrsm.cc | 30 - test/correctness/routines/level3/xtrsm.cpp | 30 + test/correctness/routines/levelx/xomatcopy.cc | 30 - test/correctness/routines/levelx/xomatcopy.cpp | 30 + test/correctness/testblas.cc | 244 -- test/correctness/testblas.cpp | 244 ++ test/correctness/tester.cc | 441 ---- test/correctness/tester.cpp | 441 ++++ test/performance/client.cc | 375 --- test/performance/client.cpp | 375 +++ test/performance/routines/level1/xamax.cc | 36 - test/performance/routines/level1/xamax.cpp | 36 + test/performance/routines/level1/xasum.cc | 36 - test/performance/routines/level1/xasum.cpp | 36 + test/performance/routines/level1/xaxpy.cc | 36 - 
test/performance/routines/level1/xaxpy.cpp | 36 + test/performance/routines/level1/xcopy.cc | 36 - test/performance/routines/level1/xcopy.cpp | 36 + test/performance/routines/level1/xdot.cc | 34 - test/performance/routines/level1/xdot.cpp | 34 + test/performance/routines/level1/xdotc.cc | 33 - test/performance/routines/level1/xdotc.cpp | 33 + test/performance/routines/level1/xdotu.cc | 33 - test/performance/routines/level1/xdotu.cpp | 33 + test/performance/routines/level1/xnrm2.cc | 36 - test/performance/routines/level1/xnrm2.cpp | 36 + test/performance/routines/level1/xrot.cc | 33 - test/performance/routines/level1/xrot.cpp | 33 + test/performance/routines/level1/xrotg.cc | 33 - test/performance/routines/level1/xrotg.cpp | 33 + test/performance/routines/level1/xrotm.cc | 33 - test/performance/routines/level1/xrotm.cpp | 33 + test/performance/routines/level1/xrotmg.cc | 33 - test/performance/routines/level1/xrotmg.cpp | 33 + test/performance/routines/level1/xscal.cc | 36 - test/performance/routines/level1/xscal.cpp | 36 + test/performance/routines/level1/xswap.cc | 36 - test/performance/routines/level1/xswap.cpp | 36 + test/performance/routines/level2/xgbmv.cc | 36 - test/performance/routines/level2/xgbmv.cpp | 36 + test/performance/routines/level2/xgemv.cc | 36 - test/performance/routines/level2/xgemv.cpp | 36 + test/performance/routines/level2/xger.cc | 34 - test/performance/routines/level2/xger.cpp | 34 + test/performance/routines/level2/xgerc.cc | 33 - test/performance/routines/level2/xgerc.cpp | 33 + test/performance/routines/level2/xgeru.cc | 33 - test/performance/routines/level2/xgeru.cpp | 33 + test/performance/routines/level2/xhbmv.cc | 33 - test/performance/routines/level2/xhbmv.cpp | 33 + test/performance/routines/level2/xhemv.cc | 33 - test/performance/routines/level2/xhemv.cpp | 33 + test/performance/routines/level2/xher.cc | 33 - test/performance/routines/level2/xher.cpp | 33 + test/performance/routines/level2/xher2.cc | 33 - 
test/performance/routines/level2/xher2.cpp | 33 + test/performance/routines/level2/xhpmv.cc | 33 - test/performance/routines/level2/xhpmv.cpp | 33 + test/performance/routines/level2/xhpr.cc | 33 - test/performance/routines/level2/xhpr.cpp | 33 + test/performance/routines/level2/xhpr2.cc | 33 - test/performance/routines/level2/xhpr2.cpp | 33 + test/performance/routines/level2/xsbmv.cc | 34 - test/performance/routines/level2/xsbmv.cpp | 34 + test/performance/routines/level2/xspmv.cc | 34 - test/performance/routines/level2/xspmv.cpp | 34 + test/performance/routines/level2/xspr.cc | 34 - test/performance/routines/level2/xspr.cpp | 34 + test/performance/routines/level2/xspr2.cc | 34 - test/performance/routines/level2/xspr2.cpp | 34 + test/performance/routines/level2/xsymv.cc | 34 - test/performance/routines/level2/xsymv.cpp | 34 + test/performance/routines/level2/xsyr.cc | 34 - test/performance/routines/level2/xsyr.cpp | 34 + test/performance/routines/level2/xsyr2.cc | 34 - test/performance/routines/level2/xsyr2.cpp | 34 + test/performance/routines/level2/xtbmv.cc | 36 - test/performance/routines/level2/xtbmv.cpp | 36 + test/performance/routines/level2/xtbsv.cc | 35 - test/performance/routines/level2/xtbsv.cpp | 35 + test/performance/routines/level2/xtpmv.cc | 36 - test/performance/routines/level2/xtpmv.cpp | 36 + test/performance/routines/level2/xtpsv.cc | 35 - test/performance/routines/level2/xtpsv.cpp | 35 + test/performance/routines/level2/xtrmv.cc | 36 - test/performance/routines/level2/xtrmv.cpp | 36 + test/performance/routines/level2/xtrsv.cc | 35 - test/performance/routines/level2/xtrsv.cpp | 35 + test/performance/routines/level3/xgemm.cc | 36 - test/performance/routines/level3/xgemm.cpp | 36 + test/performance/routines/level3/xhemm.cc | 33 - test/performance/routines/level3/xhemm.cpp | 33 + test/performance/routines/level3/xher2k.cc | 33 - test/performance/routines/level3/xher2k.cpp | 33 + test/performance/routines/level3/xherk.cc | 33 - 
test/performance/routines/level3/xherk.cpp | 33 + test/performance/routines/level3/xsymm.cc | 36 - test/performance/routines/level3/xsymm.cpp | 36 + test/performance/routines/level3/xsyr2k.cc | 36 - test/performance/routines/level3/xsyr2k.cpp | 36 + test/performance/routines/level3/xsyrk.cc | 36 - test/performance/routines/level3/xsyrk.cpp | 36 + test/performance/routines/level3/xtrmm.cc | 36 - test/performance/routines/level3/xtrmm.cpp | 36 + test/performance/routines/level3/xtrsm.cc | 36 - test/performance/routines/level3/xtrsm.cpp | 36 + test/performance/routines/levelx/xomatcopy.cc | 36 - test/performance/routines/levelx/xomatcopy.cpp | 36 + 322 files changed, 15579 insertions(+), 15578 deletions(-) delete mode 100644 samples/sgemm.cc create mode 100644 samples/sgemm.cpp delete mode 100644 src/cache.cc create mode 100644 src/cache.cpp delete mode 100644 src/clblast.cc create mode 100644 src/clblast.cpp delete mode 100644 src/clblast_c.cc create mode 100644 src/clblast_c.cpp delete mode 100644 src/database/database.cc create mode 100644 src/database/database.cpp delete mode 100644 src/routine.cc create mode 100644 src/routine.cpp delete mode 100644 src/routines/common.cc create mode 100644 src/routines/common.cpp delete mode 100644 src/routines/level1/xamax.cc create mode 100644 src/routines/level1/xamax.cpp delete mode 100644 src/routines/level1/xasum.cc create mode 100644 src/routines/level1/xasum.cpp delete mode 100644 src/routines/level1/xaxpy.cc create mode 100644 src/routines/level1/xaxpy.cpp delete mode 100644 src/routines/level1/xcopy.cc create mode 100644 src/routines/level1/xcopy.cpp delete mode 100644 src/routines/level1/xdot.cc create mode 100644 src/routines/level1/xdot.cpp delete mode 100644 src/routines/level1/xdotc.cc create mode 100644 src/routines/level1/xdotc.cpp delete mode 100644 src/routines/level1/xdotu.cc create mode 100644 src/routines/level1/xdotu.cpp delete mode 100644 src/routines/level1/xnrm2.cc create mode 100644 
src/routines/level1/xnrm2.cpp delete mode 100644 src/routines/level1/xscal.cc create mode 100644 src/routines/level1/xscal.cpp delete mode 100644 src/routines/level1/xswap.cc create mode 100644 src/routines/level1/xswap.cpp delete mode 100644 src/routines/level2/xgbmv.cc create mode 100644 src/routines/level2/xgbmv.cpp delete mode 100644 src/routines/level2/xgemv.cc create mode 100644 src/routines/level2/xgemv.cpp delete mode 100644 src/routines/level2/xger.cc create mode 100644 src/routines/level2/xger.cpp delete mode 100644 src/routines/level2/xgerc.cc create mode 100644 src/routines/level2/xgerc.cpp delete mode 100644 src/routines/level2/xgeru.cc create mode 100644 src/routines/level2/xgeru.cpp delete mode 100644 src/routines/level2/xhbmv.cc create mode 100644 src/routines/level2/xhbmv.cpp delete mode 100644 src/routines/level2/xhemv.cc create mode 100644 src/routines/level2/xhemv.cpp delete mode 100644 src/routines/level2/xher.cc create mode 100644 src/routines/level2/xher.cpp delete mode 100644 src/routines/level2/xher2.cc create mode 100644 src/routines/level2/xher2.cpp delete mode 100644 src/routines/level2/xhpmv.cc create mode 100644 src/routines/level2/xhpmv.cpp delete mode 100644 src/routines/level2/xhpr.cc create mode 100644 src/routines/level2/xhpr.cpp delete mode 100644 src/routines/level2/xhpr2.cc create mode 100644 src/routines/level2/xhpr2.cpp delete mode 100644 src/routines/level2/xsbmv.cc create mode 100644 src/routines/level2/xsbmv.cpp delete mode 100644 src/routines/level2/xspmv.cc create mode 100644 src/routines/level2/xspmv.cpp delete mode 100644 src/routines/level2/xspr.cc create mode 100644 src/routines/level2/xspr.cpp delete mode 100644 src/routines/level2/xspr2.cc create mode 100644 src/routines/level2/xspr2.cpp delete mode 100644 src/routines/level2/xsymv.cc create mode 100644 src/routines/level2/xsymv.cpp delete mode 100644 src/routines/level2/xsyr.cc create mode 100644 src/routines/level2/xsyr.cpp delete mode 100644 
src/routines/level2/xsyr2.cc create mode 100644 src/routines/level2/xsyr2.cpp delete mode 100644 src/routines/level2/xtbmv.cc create mode 100644 src/routines/level2/xtbmv.cpp delete mode 100644 src/routines/level2/xtpmv.cc create mode 100644 src/routines/level2/xtpmv.cpp delete mode 100644 src/routines/level2/xtrmv.cc create mode 100644 src/routines/level2/xtrmv.cpp delete mode 100644 src/routines/level3/xgemm.cc create mode 100644 src/routines/level3/xgemm.cpp delete mode 100644 src/routines/level3/xhemm.cc create mode 100644 src/routines/level3/xhemm.cpp delete mode 100644 src/routines/level3/xher2k.cc create mode 100644 src/routines/level3/xher2k.cpp delete mode 100644 src/routines/level3/xherk.cc create mode 100644 src/routines/level3/xherk.cpp delete mode 100644 src/routines/level3/xsymm.cc create mode 100644 src/routines/level3/xsymm.cpp delete mode 100644 src/routines/level3/xsyr2k.cc create mode 100644 src/routines/level3/xsyr2k.cpp delete mode 100644 src/routines/level3/xsyrk.cc create mode 100644 src/routines/level3/xsyrk.cpp delete mode 100644 src/routines/level3/xtrmm.cc create mode 100644 src/routines/level3/xtrmm.cpp delete mode 100644 src/routines/levelx/xomatcopy.cc create mode 100644 src/routines/levelx/xomatcopy.cpp delete mode 100644 src/tuning/kernels/copy_fast.cc create mode 100644 src/tuning/kernels/copy_fast.cpp delete mode 100644 src/tuning/kernels/copy_pad.cc create mode 100644 src/tuning/kernels/copy_pad.cpp delete mode 100644 src/tuning/kernels/transpose_fast.cc create mode 100644 src/tuning/kernels/transpose_fast.cpp delete mode 100644 src/tuning/kernels/transpose_pad.cc create mode 100644 src/tuning/kernels/transpose_pad.cpp delete mode 100644 src/tuning/kernels/xaxpy.cc create mode 100644 src/tuning/kernels/xaxpy.cpp delete mode 100644 src/tuning/kernels/xdot.cc create mode 100644 src/tuning/kernels/xdot.cpp delete mode 100644 src/tuning/kernels/xgemm.cc create mode 100644 src/tuning/kernels/xgemm.cpp delete mode 100644 
src/tuning/kernels/xgemv.cc create mode 100644 src/tuning/kernels/xgemv.cpp delete mode 100644 src/tuning/kernels/xger.cc create mode 100644 src/tuning/kernels/xger.cpp delete mode 100644 src/utilities.cc create mode 100644 src/utilities.cpp delete mode 100644 test/correctness/routines/level1/xamax.cc create mode 100644 test/correctness/routines/level1/xamax.cpp delete mode 100644 test/correctness/routines/level1/xasum.cc create mode 100644 test/correctness/routines/level1/xasum.cpp delete mode 100644 test/correctness/routines/level1/xaxpy.cc create mode 100644 test/correctness/routines/level1/xaxpy.cpp delete mode 100644 test/correctness/routines/level1/xcopy.cc create mode 100644 test/correctness/routines/level1/xcopy.cpp delete mode 100644 test/correctness/routines/level1/xdot.cc create mode 100644 test/correctness/routines/level1/xdot.cpp delete mode 100644 test/correctness/routines/level1/xdotc.cc create mode 100644 test/correctness/routines/level1/xdotc.cpp delete mode 100644 test/correctness/routines/level1/xdotu.cc create mode 100644 test/correctness/routines/level1/xdotu.cpp delete mode 100644 test/correctness/routines/level1/xnrm2.cc create mode 100644 test/correctness/routines/level1/xnrm2.cpp delete mode 100644 test/correctness/routines/level1/xrot.cc create mode 100644 test/correctness/routines/level1/xrot.cpp delete mode 100644 test/correctness/routines/level1/xrotg.cc create mode 100644 test/correctness/routines/level1/xrotg.cpp delete mode 100644 test/correctness/routines/level1/xrotm.cc create mode 100644 test/correctness/routines/level1/xrotm.cpp delete mode 100644 test/correctness/routines/level1/xrotmg.cc create mode 100644 test/correctness/routines/level1/xrotmg.cpp delete mode 100644 test/correctness/routines/level1/xscal.cc create mode 100644 test/correctness/routines/level1/xscal.cpp delete mode 100644 test/correctness/routines/level1/xswap.cc create mode 100644 test/correctness/routines/level1/xswap.cpp delete mode 100644 
test/correctness/routines/level2/xgbmv.cc create mode 100644 test/correctness/routines/level2/xgbmv.cpp delete mode 100644 test/correctness/routines/level2/xgemv.cc create mode 100644 test/correctness/routines/level2/xgemv.cpp delete mode 100644 test/correctness/routines/level2/xger.cc create mode 100644 test/correctness/routines/level2/xger.cpp delete mode 100644 test/correctness/routines/level2/xgerc.cc create mode 100644 test/correctness/routines/level2/xgerc.cpp delete mode 100644 test/correctness/routines/level2/xgeru.cc create mode 100644 test/correctness/routines/level2/xgeru.cpp delete mode 100644 test/correctness/routines/level2/xhbmv.cc create mode 100644 test/correctness/routines/level2/xhbmv.cpp delete mode 100644 test/correctness/routines/level2/xhemv.cc create mode 100644 test/correctness/routines/level2/xhemv.cpp delete mode 100644 test/correctness/routines/level2/xher.cc create mode 100644 test/correctness/routines/level2/xher.cpp delete mode 100644 test/correctness/routines/level2/xher2.cc create mode 100644 test/correctness/routines/level2/xher2.cpp delete mode 100644 test/correctness/routines/level2/xhpmv.cc create mode 100644 test/correctness/routines/level2/xhpmv.cpp delete mode 100644 test/correctness/routines/level2/xhpr.cc create mode 100644 test/correctness/routines/level2/xhpr.cpp delete mode 100644 test/correctness/routines/level2/xhpr2.cc create mode 100644 test/correctness/routines/level2/xhpr2.cpp delete mode 100644 test/correctness/routines/level2/xsbmv.cc create mode 100644 test/correctness/routines/level2/xsbmv.cpp delete mode 100644 test/correctness/routines/level2/xspmv.cc create mode 100644 test/correctness/routines/level2/xspmv.cpp delete mode 100644 test/correctness/routines/level2/xspr.cc create mode 100644 test/correctness/routines/level2/xspr.cpp delete mode 100644 test/correctness/routines/level2/xspr2.cc create mode 100644 test/correctness/routines/level2/xspr2.cpp delete mode 100644 
test/correctness/routines/level2/xsymv.cc create mode 100644 test/correctness/routines/level2/xsymv.cpp delete mode 100644 test/correctness/routines/level2/xsyr.cc create mode 100644 test/correctness/routines/level2/xsyr.cpp delete mode 100644 test/correctness/routines/level2/xsyr2.cc create mode 100644 test/correctness/routines/level2/xsyr2.cpp delete mode 100644 test/correctness/routines/level2/xtbmv.cc create mode 100644 test/correctness/routines/level2/xtbmv.cpp delete mode 100644 test/correctness/routines/level2/xtbsv.cc create mode 100644 test/correctness/routines/level2/xtbsv.cpp delete mode 100644 test/correctness/routines/level2/xtpmv.cc create mode 100644 test/correctness/routines/level2/xtpmv.cpp delete mode 100644 test/correctness/routines/level2/xtpsv.cc create mode 100644 test/correctness/routines/level2/xtpsv.cpp delete mode 100644 test/correctness/routines/level2/xtrmv.cc create mode 100644 test/correctness/routines/level2/xtrmv.cpp delete mode 100644 test/correctness/routines/level2/xtrsv.cc create mode 100644 test/correctness/routines/level2/xtrsv.cpp delete mode 100644 test/correctness/routines/level3/xgemm.cc create mode 100644 test/correctness/routines/level3/xgemm.cpp delete mode 100644 test/correctness/routines/level3/xhemm.cc create mode 100644 test/correctness/routines/level3/xhemm.cpp delete mode 100644 test/correctness/routines/level3/xher2k.cc create mode 100644 test/correctness/routines/level3/xher2k.cpp delete mode 100644 test/correctness/routines/level3/xherk.cc create mode 100644 test/correctness/routines/level3/xherk.cpp delete mode 100644 test/correctness/routines/level3/xsymm.cc create mode 100644 test/correctness/routines/level3/xsymm.cpp delete mode 100644 test/correctness/routines/level3/xsyr2k.cc create mode 100644 test/correctness/routines/level3/xsyr2k.cpp delete mode 100644 test/correctness/routines/level3/xsyrk.cc create mode 100644 test/correctness/routines/level3/xsyrk.cpp delete mode 100644 
test/correctness/routines/level3/xtrmm.cc create mode 100644 test/correctness/routines/level3/xtrmm.cpp delete mode 100644 test/correctness/routines/level3/xtrsm.cc create mode 100644 test/correctness/routines/level3/xtrsm.cpp delete mode 100644 test/correctness/routines/levelx/xomatcopy.cc create mode 100644 test/correctness/routines/levelx/xomatcopy.cpp delete mode 100644 test/correctness/testblas.cc create mode 100644 test/correctness/testblas.cpp delete mode 100644 test/correctness/tester.cc create mode 100644 test/correctness/tester.cpp delete mode 100644 test/performance/client.cc create mode 100644 test/performance/client.cpp delete mode 100644 test/performance/routines/level1/xamax.cc create mode 100644 test/performance/routines/level1/xamax.cpp delete mode 100644 test/performance/routines/level1/xasum.cc create mode 100644 test/performance/routines/level1/xasum.cpp delete mode 100644 test/performance/routines/level1/xaxpy.cc create mode 100644 test/performance/routines/level1/xaxpy.cpp delete mode 100644 test/performance/routines/level1/xcopy.cc create mode 100644 test/performance/routines/level1/xcopy.cpp delete mode 100644 test/performance/routines/level1/xdot.cc create mode 100644 test/performance/routines/level1/xdot.cpp delete mode 100644 test/performance/routines/level1/xdotc.cc create mode 100644 test/performance/routines/level1/xdotc.cpp delete mode 100644 test/performance/routines/level1/xdotu.cc create mode 100644 test/performance/routines/level1/xdotu.cpp delete mode 100644 test/performance/routines/level1/xnrm2.cc create mode 100644 test/performance/routines/level1/xnrm2.cpp delete mode 100644 test/performance/routines/level1/xrot.cc create mode 100644 test/performance/routines/level1/xrot.cpp delete mode 100644 test/performance/routines/level1/xrotg.cc create mode 100644 test/performance/routines/level1/xrotg.cpp delete mode 100644 test/performance/routines/level1/xrotm.cc create mode 100644 test/performance/routines/level1/xrotm.cpp delete 
mode 100644 test/performance/routines/level1/xrotmg.cc create mode 100644 test/performance/routines/level1/xrotmg.cpp delete mode 100644 test/performance/routines/level1/xscal.cc create mode 100644 test/performance/routines/level1/xscal.cpp delete mode 100644 test/performance/routines/level1/xswap.cc create mode 100644 test/performance/routines/level1/xswap.cpp delete mode 100644 test/performance/routines/level2/xgbmv.cc create mode 100644 test/performance/routines/level2/xgbmv.cpp delete mode 100644 test/performance/routines/level2/xgemv.cc create mode 100644 test/performance/routines/level2/xgemv.cpp delete mode 100644 test/performance/routines/level2/xger.cc create mode 100644 test/performance/routines/level2/xger.cpp delete mode 100644 test/performance/routines/level2/xgerc.cc create mode 100644 test/performance/routines/level2/xgerc.cpp delete mode 100644 test/performance/routines/level2/xgeru.cc create mode 100644 test/performance/routines/level2/xgeru.cpp delete mode 100644 test/performance/routines/level2/xhbmv.cc create mode 100644 test/performance/routines/level2/xhbmv.cpp delete mode 100644 test/performance/routines/level2/xhemv.cc create mode 100644 test/performance/routines/level2/xhemv.cpp delete mode 100644 test/performance/routines/level2/xher.cc create mode 100644 test/performance/routines/level2/xher.cpp delete mode 100644 test/performance/routines/level2/xher2.cc create mode 100644 test/performance/routines/level2/xher2.cpp delete mode 100644 test/performance/routines/level2/xhpmv.cc create mode 100644 test/performance/routines/level2/xhpmv.cpp delete mode 100644 test/performance/routines/level2/xhpr.cc create mode 100644 test/performance/routines/level2/xhpr.cpp delete mode 100644 test/performance/routines/level2/xhpr2.cc create mode 100644 test/performance/routines/level2/xhpr2.cpp delete mode 100644 test/performance/routines/level2/xsbmv.cc create mode 100644 test/performance/routines/level2/xsbmv.cpp delete mode 100644 
test/performance/routines/level2/xspmv.cc create mode 100644 test/performance/routines/level2/xspmv.cpp delete mode 100644 test/performance/routines/level2/xspr.cc create mode 100644 test/performance/routines/level2/xspr.cpp delete mode 100644 test/performance/routines/level2/xspr2.cc create mode 100644 test/performance/routines/level2/xspr2.cpp delete mode 100644 test/performance/routines/level2/xsymv.cc create mode 100644 test/performance/routines/level2/xsymv.cpp delete mode 100644 test/performance/routines/level2/xsyr.cc create mode 100644 test/performance/routines/level2/xsyr.cpp delete mode 100644 test/performance/routines/level2/xsyr2.cc create mode 100644 test/performance/routines/level2/xsyr2.cpp delete mode 100644 test/performance/routines/level2/xtbmv.cc create mode 100644 test/performance/routines/level2/xtbmv.cpp delete mode 100644 test/performance/routines/level2/xtbsv.cc create mode 100644 test/performance/routines/level2/xtbsv.cpp delete mode 100644 test/performance/routines/level2/xtpmv.cc create mode 100644 test/performance/routines/level2/xtpmv.cpp delete mode 100644 test/performance/routines/level2/xtpsv.cc create mode 100644 test/performance/routines/level2/xtpsv.cpp delete mode 100644 test/performance/routines/level2/xtrmv.cc create mode 100644 test/performance/routines/level2/xtrmv.cpp delete mode 100644 test/performance/routines/level2/xtrsv.cc create mode 100644 test/performance/routines/level2/xtrsv.cpp delete mode 100644 test/performance/routines/level3/xgemm.cc create mode 100644 test/performance/routines/level3/xgemm.cpp delete mode 100644 test/performance/routines/level3/xhemm.cc create mode 100644 test/performance/routines/level3/xhemm.cpp delete mode 100644 test/performance/routines/level3/xher2k.cc create mode 100644 test/performance/routines/level3/xher2k.cpp delete mode 100644 test/performance/routines/level3/xherk.cc create mode 100644 test/performance/routines/level3/xherk.cpp delete mode 100644 
test/performance/routines/level3/xsymm.cc create mode 100644 test/performance/routines/level3/xsymm.cpp delete mode 100644 test/performance/routines/level3/xsyr2k.cc create mode 100644 test/performance/routines/level3/xsyr2k.cpp delete mode 100644 test/performance/routines/level3/xsyrk.cc create mode 100644 test/performance/routines/level3/xsyrk.cpp delete mode 100644 test/performance/routines/level3/xtrmm.cc create mode 100644 test/performance/routines/level3/xtrmm.cpp delete mode 100644 test/performance/routines/level3/xtrsm.cc create mode 100644 test/performance/routines/level3/xtrsm.cpp delete mode 100644 test/performance/routines/levelx/xomatcopy.cc create mode 100644 test/performance/routines/levelx/xomatcopy.cpp diff --git a/CHANGELOG b/CHANGELOG index e9063b91..fd5b3610 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ Development version (next release) - Made it possible to compile the performance tests (clients) separately from the correctness tests - Made a reference BLAS and head-to-head performance comparison optional in the clients - Increased the verbosity of the "-verbose" option in the correctness tests +- Refactored the host code for better compilation times and fewer lines of code - Improved the API documentation - Various minor fixes and enhancements - Added tuned parameters for various devices (see README) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ca56bd7..0df2b3bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -141,25 +141,25 @@ set(PRECISIONS 32 64 3232 6464) # Gathers all source-files set(SOURCES - src/database/database.cc - src/routines/common.cc - src/cache.cc - src/clblast.cc - src/clblast_c.cc - src/routine.cc - src/utilities.cc + src/database/database.cpp + src/routines/common.cpp + src/cache.cpp + src/clblast.cpp + src/clblast_c.cpp + src/routine.cpp + src/utilities.cpp ) foreach(ROUTINE ${LEVEL1_ROUTINES}) - set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc) + set(SOURCES ${SOURCES} 
src/routines/level1/${ROUTINE}.cpp) endforeach() foreach(ROUTINE ${LEVEL2_ROUTINES}) - set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cc) + set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cpp) endforeach() foreach(ROUTINE ${LEVEL3_ROUTINES}) - set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc) + set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cpp) endforeach() foreach(ROUTINE ${LEVELX_ROUTINES}) - set(SOURCES ${SOURCES} src/routines/levelx/${ROUTINE}.cc) + set(SOURCES ${SOURCES} src/routines/levelx/${ROUTINE}.cpp) endforeach() # Creates and links the library @@ -193,7 +193,7 @@ if(SAMPLES) # Adds sample programs (C++) foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP}) - add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cc) + add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cpp) target_link_libraries(clblast_sample_${SAMPLE} clblast ${OPENCL_LIBRARIES}) install(TARGETS clblast_sample_${SAMPLE} DESTINATION bin) endforeach() @@ -218,7 +218,7 @@ if(TUNERS) # Adds tuning executables foreach(KERNEL ${KERNELS}) - add_executable(clblast_tuner_${KERNEL} src/tuning/kernels/${KERNEL}.cc) + add_executable(clblast_tuner_${KERNEL} src/tuning/kernels/${KERNEL}.cpp) target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES}) install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin) endforeach() @@ -275,24 +275,24 @@ endif() if(CLIENTS) # Creates the common performance-tests objects (requires CMake 2.8.8) - add_library(test_performance_common OBJECT test/performance/client.cc) + add_library(test_performance_common OBJECT test/performance/client.cpp) # Compiles the performance-tests foreach(ROUTINE ${LEVEL1_ROUTINES}) add_executable(clblast_client_${ROUTINE} $ - test/performance/routines/level1/${ROUTINE}.cc) + test/performance/routines/level1/${ROUTINE}.cpp) endforeach() foreach(ROUTINE ${LEVEL2_ROUTINES}) add_executable(clblast_client_${ROUTINE} $ - test/performance/routines/level2/${ROUTINE}.cc) + 
test/performance/routines/level2/${ROUTINE}.cpp) endforeach() foreach(ROUTINE ${LEVEL3_ROUTINES}) add_executable(clblast_client_${ROUTINE} $ - test/performance/routines/level3/${ROUTINE}.cc) + test/performance/routines/level3/${ROUTINE}.cpp) endforeach() foreach(ROUTINE ${LEVELX_ROUTINES}) add_executable(clblast_client_${ROUTINE} $ - test/performance/routines/levelx/${ROUTINE}.cc) + test/performance/routines/levelx/${ROUTINE}.cpp) endforeach() foreach(ROUTINE ${ROUTINES}) target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) @@ -310,24 +310,24 @@ if(TESTS) # Creates the common correctness-tests objects (requires CMake 2.8.8) add_library(test_correctness_common OBJECT - test/correctness/tester.cc test/correctness/testblas.cc) + test/correctness/tester.cpp test/correctness/testblas.cpp) # Compiles the correctness-tests foreach(ROUTINE ${LEVEL1_ROUTINES}) add_executable(clblast_test_${ROUTINE} $ - test/correctness/routines/level1/${ROUTINE}.cc) + test/correctness/routines/level1/${ROUTINE}.cpp) endforeach() foreach(ROUTINE ${LEVEL2_ROUTINES}) add_executable(clblast_test_${ROUTINE} $ - test/correctness/routines/level2/${ROUTINE}.cc) + test/correctness/routines/level2/${ROUTINE}.cpp) endforeach() foreach(ROUTINE ${LEVEL3_ROUTINES}) add_executable(clblast_test_${ROUTINE} $ - test/correctness/routines/level3/${ROUTINE}.cc) + test/correctness/routines/level3/${ROUTINE}.cpp) endforeach() foreach(ROUTINE ${LEVELX_ROUTINES}) add_executable(clblast_test_${ROUTINE} $ - test/correctness/routines/levelx/${ROUTINE}.cc) + test/correctness/routines/levelx/${ROUTINE}.cpp) endforeach() foreach(ROUTINE ${ROUTINES}) target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) diff --git a/README.md b/README.md index 26dfb149..5d2c0c9e 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ Note that CLBlast's tuners are based on the CLTune auto-tuning library, which ha Compiling with `-DTUNERS=ON` will generate a 
number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environmental variables before running CMake. -The tuners output a JSON-file with the results. The best results need to be added to `include/internal/database/xxxxx.h` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl). +The tuners output a JSON-file with the results. The best results need to be added to `include/internal/database/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl). 
In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder): diff --git a/samples/sgemm.cc b/samples/sgemm.cc deleted file mode 100644 index 5fe7490a..00000000 --- a/samples/sgemm.cc +++ /dev/null @@ -1,107 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file demonstrates the use of the SGEMM routine. It is a stand-alone example, but it does -// require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++ -// features, but CLBlast can also be used using the regular C-style OpenCL API. -// -// Note that this example is meant for illustration purposes only. CLBlast provides other programs -// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). -// -// ================================================================================================= - -#include -#include -#include - -// Includes the C++ OpenCL API. 
If not yet available, it can be found here: -// https://www.khronos.org/registry/cl/api/1.1/cl.hpp -#include "cl.hpp" - -// Includes the CLBlast library -#include - -// ================================================================================================= - -// Example use of the single-precision Xgemm routine SGEMM -int main() { - - // OpenCL platform/device settings - const auto platform_id = 0; - const auto device_id = 0; - - // Example SGEMM arguments - const size_t m = 128; - const size_t n = 64; - const size_t k = 512; - const float alpha = 0.7f; - const float beta = 1.0f; - const auto a_ld = k; - const auto b_ld = n; - const auto c_ld = n; - - // Initializes the OpenCL platform - auto platforms = std::vector(); - cl::Platform::get(&platforms); - if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; } - auto platform = platforms[platform_id]; - - // Initializes the OpenCL device - auto devices = std::vector(); - platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); - if (devices.size() == 0 || device_id >= devices.size()) { return 1; } - auto device = devices[device_id]; - - // Creates the OpenCL context, queue, and an event - auto context = cl::Context({device}); - auto queue = cl::CommandQueue(context, device); - auto event = cl_event{nullptr}; - - // Populate host matrices with some example data - auto host_a = std::vector(m*k); - auto host_b = std::vector(n*k); - auto host_c = std::vector(m*n); - for (auto &item: host_a) { item = 12.193f; } - for (auto &item: host_b) { item = -8.199f; } - for (auto &item: host_c) { item = 0.0f; } - - // Copy the matrices to the device - auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(float)); - auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(float)); - auto device_c = cl::Buffer(context, CL_MEM_READ_WRITE, host_c.size()*sizeof(float)); - queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(float), host_a.data()); - 
queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(float), host_b.data()); - queue.enqueueWriteBuffer(device_c, CL_TRUE, 0, host_c.size()*sizeof(float), host_c.data()); - - // Start the timer - auto start_time = std::chrono::steady_clock::now(); - - // Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision. - auto queue_plain = queue(); - auto status = clblast::Gemm(clblast::Layout::kRowMajor, - clblast::Transpose::kNo, clblast::Transpose::kNo, - m, n, k, - alpha, - device_a(), 0, a_ld, - device_b(), 0, b_ld, - beta, - device_c(), 0, c_ld, - &queue_plain, &event); - - // Record the execution time - clWaitForEvents(1, &event); - auto elapsed_time = std::chrono::steady_clock::now() - start_time; - auto time_ms = std::chrono::duration(elapsed_time).count(); - - // Example completed. See "clblast.h" for status codes (0 -> success). - printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, status); - return 0; -} - -// ================================================================================================= diff --git a/samples/sgemm.cpp b/samples/sgemm.cpp new file mode 100644 index 00000000..5fe7490a --- /dev/null +++ b/samples/sgemm.cpp @@ -0,0 +1,107 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file demonstrates the use of the SGEMM routine. It is a stand-alone example, but it does +// require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++ +// features, but CLBlast can also be used using the regular C-style OpenCL API. +// +// Note that this example is meant for illustration purposes only. 
CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include +#include +#include + +// Includes the C++ OpenCL API. If not yet available, it can be found here: +// https://www.khronos.org/registry/cl/api/1.1/cl.hpp +#include "cl.hpp" + +// Includes the CLBlast library +#include + +// ================================================================================================= + +// Example use of the single-precision Xgemm routine SGEMM +int main() { + + // OpenCL platform/device settings + const auto platform_id = 0; + const auto device_id = 0; + + // Example SGEMM arguments + const size_t m = 128; + const size_t n = 64; + const size_t k = 512; + const float alpha = 0.7f; + const float beta = 1.0f; + const auto a_ld = k; + const auto b_ld = n; + const auto c_ld = n; + + // Initializes the OpenCL platform + auto platforms = std::vector(); + cl::Platform::get(&platforms); + if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; } + auto platform = platforms[platform_id]; + + // Initializes the OpenCL device + auto devices = std::vector(); + platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); + if (devices.size() == 0 || device_id >= devices.size()) { return 1; } + auto device = devices[device_id]; + + // Creates the OpenCL context, queue, and an event + auto context = cl::Context({device}); + auto queue = cl::CommandQueue(context, device); + auto event = cl_event{nullptr}; + + // Populate host matrices with some example data + auto host_a = std::vector(m*k); + auto host_b = std::vector(n*k); + auto host_c = std::vector(m*n); + for (auto &item: host_a) { item = 12.193f; } + for (auto &item: host_b) { item = -8.199f; } + for (auto &item: host_c) { item = 0.0f; } + + // Copy the matrices to the device + auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, 
host_a.size()*sizeof(float)); + auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(float)); + auto device_c = cl::Buffer(context, CL_MEM_READ_WRITE, host_c.size()*sizeof(float)); + queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(float), host_a.data()); + queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(float), host_b.data()); + queue.enqueueWriteBuffer(device_c, CL_TRUE, 0, host_c.size()*sizeof(float), host_c.data()); + + // Start the timer + auto start_time = std::chrono::steady_clock::now(); + + // Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision. + auto queue_plain = queue(); + auto status = clblast::Gemm(clblast::Layout::kRowMajor, + clblast::Transpose::kNo, clblast::Transpose::kNo, + m, n, k, + alpha, + device_a(), 0, a_ld, + device_b(), 0, b_ld, + beta, + device_c(), 0, c_ld, + &queue_plain, &event); + + // Record the execution time + clWaitForEvents(1, &event); + auto elapsed_time = std::chrono::steady_clock::now() - start_time; + auto time_ms = std::chrono::duration(elapsed_time).count(); + + // Example completed. See "clblast.h" for status codes (0 -> success). 
+ printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, status); + return 0; +} + +// ================================================================================================= diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 1df4c8e6..cf01f79e 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -10,14 +10,14 @@ # This script automatically generates the bodies of the following files, creating the full CLBlast # API interface and implementation (C, C++, and reference BLAS wrappers): # clblast.h -# clblast.cc +# clblast.cpp # clblast_c.h -# clblast_c.cc +# clblast_c.cpp # wrapper_clblas.h # wrapper_cblas.h # It also generates the main functions for the correctness and performance tests as found in -# test/correctness/routines/levelX/xYYYY.cc -# test/performance/routines/levelX/xYYYY.cc +# test/correctness/routines/levelX/xYYYY.cpp +# test/performance/routines/levelX/xYYYY.cpp # It also produces the API documentation found in doc/clblast.md # # ================================================================================================== @@ -200,7 +200,7 @@ def clblast_h(routines): result += routine.RoutineHeaderCPP(12, " = nullptr")+";\n" return result -# The C++ API implementation (.cc) +# The C++ API implementation (.cpp) def clblast_cc(routines): result = "" for routine in routines: @@ -237,7 +237,7 @@ def clblast_c_h(routines): result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+";\n" return result -# The C API implementation (.cc) +# The C API implementation (.cpp) def clblast_c_cc(routines): result = "" for routine in routines: @@ -379,9 +379,9 @@ if len(sys.argv) != 2: path_clblast = sys.argv[1] files = [ path_clblast+"/include/clblast.h", - path_clblast+"/src/clblast.cc", + path_clblast+"/src/clblast.cpp", path_clblast+"/include/clblast_c.h", - path_clblast+"/src/clblast_c.cc", + path_clblast+"/src/clblast_c.cpp", path_clblast+"/test/wrapper_clblas.hpp", 
path_clblast+"/test/wrapper_cblas.hpp", ] @@ -433,7 +433,7 @@ for i in xrange(0,len(files)): for level in [1,2,3,4]: for routine in routines[level-1]: if routine.has_tests: - filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cc" + filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp" with open(filename, "w") as f: body = "" body += "#include \"test/correctness/testblas.hpp\"\n" @@ -459,7 +459,7 @@ for level in [1,2,3,4]: for level in [1,2,3,4]: for routine in routines[level-1]: if routine.has_tests: - filename = path_clblast+"/test/performance/routines/level"+levelnames[level-1]+"/x"+routine.name+".cc" + filename = path_clblast+"/test/performance/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp" with open(filename, "w") as f: body = "" body += "#include \"test/performance/client.hpp\"\n" diff --git a/src/cache.cc b/src/cache.cc deleted file mode 100644 index cd9055d0..00000000 --- a/src/cache.cc +++ /dev/null @@ -1,111 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the caching functionality of compiled binaries and programs. 
-// -// ================================================================================================= - -#include -#include -#include - -#include "cache.hpp" - -namespace clblast { -// ================================================================================================= - -// Stores the compiled binary or IR in the cache -void StoreBinaryToCache(const std::string &binary, const std::string &device_name, - const Precision &precision, const std::string &routine_name) { - binary_cache_mutex_.lock(); - binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name}); - binary_cache_mutex_.unlock(); -} - -// Stores the compiled program in the cache -void StoreProgramToCache(const Program &program, const Context &context, - const Precision &precision, const std::string &routine_name) { - program_cache_mutex_.lock(); - program_cache_.push_back(ProgramCache{program, context.pointer(), precision, routine_name}); - program_cache_mutex_.unlock(); -} - -// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws -// otherwise. -const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name) { - binary_cache_mutex_.lock(); - for (auto &cached_binary: binary_cache_) { - if (cached_binary.MatchInCache(device_name, precision, routine_name)) { - binary_cache_mutex_.unlock(); - return cached_binary.binary; - } - } - binary_cache_mutex_.unlock(); - throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none."); -} - -// Queries the cache and retrieves a matching program. Assumes that the match is available, throws -// otherwise. 
-const Program& GetProgramFromCache(const Context &context, const Precision &precision, - const std::string &routine_name) { - program_cache_mutex_.lock(); - for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) { - program_cache_mutex_.unlock(); - return cached_program.program; - } - } - program_cache_mutex_.unlock(); - throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none."); -} - -// Queries the cache to see whether or not the compiled kernel is already there -bool BinaryIsInCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name) { - binary_cache_mutex_.lock(); - for (auto &cached_binary: binary_cache_) { - if (cached_binary.MatchInCache(device_name, precision, routine_name)) { - binary_cache_mutex_.unlock(); - return true; - } - } - binary_cache_mutex_.unlock(); - return false; -} - -// Queries the cache to see whether or not the compiled kernel is already there -bool ProgramIsInCache(const Context &context, const Precision &precision, - const std::string &routine_name) { - program_cache_mutex_.lock(); - for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) { - program_cache_mutex_.unlock(); - return true; - } - } - program_cache_mutex_.unlock(); - return false; -} - -// ================================================================================================= - -// Clears the cache of stored binaries and programs -StatusCode CacheClearAll() { - binary_cache_mutex_.lock(); - binary_cache_.clear(); - binary_cache_mutex_.unlock(); - program_cache_mutex_.lock(); - program_cache_.clear(); - program_cache_mutex_.unlock(); - return StatusCode::kSuccess; -} - -// ================================================================================================= -} // namespace clblast diff --git a/src/cache.cpp b/src/cache.cpp new file mode 
100644 index 00000000..cd9055d0 --- /dev/null +++ b/src/cache.cpp @@ -0,0 +1,111 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the caching functionality of compiled binaries and programs. +// +// ================================================================================================= + +#include +#include +#include + +#include "cache.hpp" + +namespace clblast { +// ================================================================================================= + +// Stores the compiled binary or IR in the cache +void StoreBinaryToCache(const std::string &binary, const std::string &device_name, + const Precision &precision, const std::string &routine_name) { + binary_cache_mutex_.lock(); + binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name}); + binary_cache_mutex_.unlock(); +} + +// Stores the compiled program in the cache +void StoreProgramToCache(const Program &program, const Context &context, + const Precision &precision, const std::string &routine_name) { + program_cache_mutex_.lock(); + program_cache_.push_back(ProgramCache{program, context.pointer(), precision, routine_name}); + program_cache_mutex_.unlock(); +} + +// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws +// otherwise. 
+const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name) { + binary_cache_mutex_.lock(); + for (auto &cached_binary: binary_cache_) { + if (cached_binary.MatchInCache(device_name, precision, routine_name)) { + binary_cache_mutex_.unlock(); + return cached_binary.binary; + } + } + binary_cache_mutex_.unlock(); + throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none."); +} + +// Queries the cache and retrieves a matching program. Assumes that the match is available, throws +// otherwise. +const Program& GetProgramFromCache(const Context &context, const Precision &precision, + const std::string &routine_name) { + program_cache_mutex_.lock(); + for (auto &cached_program: program_cache_) { + if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) { + program_cache_mutex_.unlock(); + return cached_program.program; + } + } + program_cache_mutex_.unlock(); + throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none."); +} + +// Queries the cache to see whether or not the compiled kernel is already there +bool BinaryIsInCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name) { + binary_cache_mutex_.lock(); + for (auto &cached_binary: binary_cache_) { + if (cached_binary.MatchInCache(device_name, precision, routine_name)) { + binary_cache_mutex_.unlock(); + return true; + } + } + binary_cache_mutex_.unlock(); + return false; +} + +// Queries the cache to see whether or not the compiled kernel is already there +bool ProgramIsInCache(const Context &context, const Precision &precision, + const std::string &routine_name) { + program_cache_mutex_.lock(); + for (auto &cached_program: program_cache_) { + if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) { + program_cache_mutex_.unlock(); + return true; + } + } + program_cache_mutex_.unlock(); + 
return false; +} + +// ================================================================================================= + +// Clears the cache of stored binaries and programs +StatusCode CacheClearAll() { + binary_cache_mutex_.lock(); + binary_cache_.clear(); + binary_cache_mutex_.unlock(); + program_cache_mutex_.lock(); + program_cache_.clear(); + program_cache_mutex_.unlock(); + return StatusCode::kSuccess; +} + +// ================================================================================================= +} // namespace clblast diff --git a/src/clblast.cc b/src/clblast.cc deleted file mode 100644 index 88d60772..00000000 --- a/src/clblast.cc +++ /dev/null @@ -1,2193 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements all the BLAS API calls. In all cases, it does not much more than creating -// a new object of the appropriate type, and calling the main routine on that object. It forwards -// all status codes to the caller. 
-// -// ================================================================================================= - -#include - -#include "clblast.h" -#include "public_api.hpp" -#include "cache.hpp" - -// BLAS level-1 includes -#include "routines/level1/xswap.hpp" -#include "routines/level1/xscal.hpp" -#include "routines/level1/xcopy.hpp" -#include "routines/level1/xaxpy.hpp" -#include "routines/level1/xdot.hpp" -#include "routines/level1/xdotu.hpp" -#include "routines/level1/xdotc.hpp" -#include "routines/level1/xnrm2.hpp" -#include "routines/level1/xasum.hpp" -#include "routines/level1/xsum.hpp" // non-BLAS routine -#include "routines/level1/xamax.hpp" -#include "routines/level1/xmax.hpp" // non-BLAS routine -#include "routines/level1/xmin.hpp" // non-BLAS routine - -// BLAS level-2 includes -#include "routines/level2/xgemv.hpp" -#include "routines/level2/xgbmv.hpp" -#include "routines/level2/xhemv.hpp" -#include "routines/level2/xhbmv.hpp" -#include "routines/level2/xhpmv.hpp" -#include "routines/level2/xsymv.hpp" -#include "routines/level2/xsbmv.hpp" -#include "routines/level2/xspmv.hpp" -#include "routines/level2/xtrmv.hpp" -#include "routines/level2/xtbmv.hpp" -#include "routines/level2/xtpmv.hpp" -#include "routines/level2/xger.hpp" -#include "routines/level2/xgeru.hpp" -#include "routines/level2/xgerc.hpp" -#include "routines/level2/xher.hpp" -#include "routines/level2/xhpr.hpp" -#include "routines/level2/xher2.hpp" -#include "routines/level2/xhpr2.hpp" -#include "routines/level2/xsyr.hpp" -#include "routines/level2/xspr.hpp" -#include "routines/level2/xsyr2.hpp" -#include "routines/level2/xspr2.hpp" - -// BLAS level-3 includes -#include "routines/level3/xgemm.hpp" -#include "routines/level3/xsymm.hpp" -#include "routines/level3/xhemm.hpp" -#include "routines/level3/xsyrk.hpp" -#include "routines/level3/xherk.hpp" -#include "routines/level3/xsyr2k.hpp" -#include "routines/level3/xher2k.hpp" -#include "routines/level3/xtrmm.hpp" - -// Level-x includes (non-BLAS) 
-#include "routines/levelx/xomatcopy.hpp" - -namespace clblast { - -// ================================================================================================= -// BLAS level-1 (vector-vector) routines -// ================================================================================================= - -// Generate givens plane rotation: SROTG/DROTG -template -StatusCode Rotg(cl_mem, const size_t, - cl_mem, const size_t, - cl_mem, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; -} -template StatusCode PUBLIC_API Rotg(cl_mem, const size_t, - cl_mem, const size_t, - cl_mem, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Rotg(cl_mem, const size_t, - cl_mem, const size_t, - cl_mem, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); - -// Generate modified givens plane rotation: SROTMG/DROTMG -template -StatusCode Rotmg(cl_mem, const size_t, - cl_mem, const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; -} -template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, - cl_mem, const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, - cl_mem, const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); - -// Apply givens plane rotation: SROT/DROT -template -StatusCode Rot(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - const T, - const T, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; -} -template StatusCode PUBLIC_API Rot(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - const float, - const float, - cl_command_queue*, 
cl_event*); -template StatusCode PUBLIC_API Rot(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - const double, - const double, - cl_command_queue*, cl_event*); - -// Apply modified givens plane rotation: SROTM/DROTM -template -StatusCode Rotm(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; -} -template StatusCode PUBLIC_API Rotm(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Rotm(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); - -// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP -template -StatusCode Swap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xswap(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSwap(n, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Swap(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Swap(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Swap(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Swap(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - 
cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Swap(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL -template -StatusCode Scal(const size_t n, - const T alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xscal(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoScal(n, - alpha, - Buffer(x_buffer), x_offset, x_inc); -} -template StatusCode PUBLIC_API Scal(const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Scal(const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Scal(const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Scal(const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Scal(const size_t, - const half, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY -template -StatusCode Copy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xcopy(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoCopy(n, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Copy(const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, 
const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Copy(const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Copy(const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Copy(const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Copy(const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY -template -StatusCode Axpy(const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xaxpy(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoAxpy(n, - alpha, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Axpy(const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Axpy(const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Axpy(const size_t, - const float2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Axpy(const size_t, - const double2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - 
cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Axpy(const size_t, - const half, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Dot product of two vectors: SDOT/DDOT/HDOT -template -StatusCode Dot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xdot(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoDot(n, - Buffer(dot_buffer), dot_offset, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Dot(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Dot(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Dot(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Dot product of two complex vectors: CDOTU/ZDOTU -template -StatusCode Dotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xdotu(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoDotu(n, - Buffer(dot_buffer), dot_offset, - Buffer(x_buffer), x_offset, 
x_inc, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Dotu(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Dotu(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC -template -StatusCode Dotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xdotc(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoDotc(n, - Buffer(dot_buffer), dot_offset, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Dotc(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Dotc(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 -template -StatusCode Nrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xnrm2(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoNrm2(n, - Buffer(nrm2_buffer), nrm2_offset, - Buffer(x_buffer), 
x_offset, x_inc); -} -template StatusCode PUBLIC_API Nrm2(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Nrm2(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Nrm2(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Nrm2(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Nrm2(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM -template -StatusCode Asum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xasum(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoAsum(n, - Buffer(asum_buffer), asum_offset, - Buffer(x_buffer), x_offset, x_inc); -} -template StatusCode PUBLIC_API Asum(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Asum(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Asum(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Asum(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Asum(const size_t, - cl_mem, 
const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM -template -StatusCode Sum(const size_t n, - cl_mem sum_buffer, const size_t sum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsum(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSum(n, - Buffer(sum_buffer), sum_offset, - Buffer(x_buffer), x_offset, x_inc); -} -template StatusCode PUBLIC_API Sum(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Sum(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Sum(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Sum(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Sum(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX -template -StatusCode Amax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xamax(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoAmax(n, - Buffer(imax_buffer), imax_offset, - Buffer(x_buffer), x_offset, x_inc); -} -template StatusCode PUBLIC_API Amax(const 
size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Amax(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Amax(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Amax(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Amax(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX -template -StatusCode Max(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xmax(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoMax(n, - Buffer(imax_buffer), imax_offset, - Buffer(x_buffer), x_offset, x_inc); -} -template StatusCode PUBLIC_API Max(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Max(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Max(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Max(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Max(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const 
size_t, - cl_command_queue*, cl_event*); - -// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN -template -StatusCode Min(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xmin(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoMin(n, - Buffer(imin_buffer), imin_offset, - Buffer(x_buffer), x_offset, x_inc); -} -template StatusCode PUBLIC_API Min(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Min(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Min(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Min(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Min(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// ================================================================================================= -// BLAS level-2 (matrix-vector) routines -// ================================================================================================= - -// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV -template -StatusCode Gemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t 
y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xgemv(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoGemv(layout, a_transpose, - m, n, - alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, - beta, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, - const size_t, const size_t, - const half, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const half, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV -template -StatusCode Gbmv(const Layout layout, 
const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xgbmv(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoGbmv(layout, a_transpose, - m, n, kl, ku, - alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, - beta, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, - const size_t, const size_t, const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, - const size_t, const size_t, const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, - const size_t, const size_t, const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, - const size_t, const size_t, const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); 
-template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, - const size_t, const size_t, const size_t, const size_t, - const half, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const half, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Hermitian matrix-vector multiplication: CHEMV/ZHEMV -template -StatusCode Hemv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xhemv(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHemv(layout, triangle, - n, - alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, - beta, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, - const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, - const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV -template -StatusCode Hbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const 
size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xhbmv(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHbmv(layout, triangle, - n, k, - alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, - beta, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV -template -StatusCode Hpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xhpmv(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHpmv(layout, triangle, - n, - alpha, - Buffer(ap_buffer), ap_offset, - Buffer(x_buffer), x_offset, x_inc, - beta, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, - const size_t, - const float2, - const cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - 
cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, - const size_t, - const double2, - const cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV -template -StatusCode Symv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsymv(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSymv(layout, triangle, - n, - alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, - beta, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Symv(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Symv(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Symv(const Layout, const Triangle, - const size_t, - const half, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const half, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV -template -StatusCode Sbmv(const Layout 
layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsbmv(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSbmv(layout, triangle, - n, k, - alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, - beta, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, - const size_t, const size_t, - const half, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const half, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV -template -StatusCode Spmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = 
Queue(*queue); - auto routine = Xspmv(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSpmv(layout, triangle, - n, - alpha, - Buffer(ap_buffer), ap_offset, - Buffer(x_buffer), x_offset, x_inc, - beta, - Buffer(y_buffer), y_offset, y_inc); -} -template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, - const size_t, - const half, - const cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const half, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV -template -StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xtrmv(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoTrmv(layout, triangle, a_transpose, diagonal, - n, - Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc); -} -template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const 
size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV -template -StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xtbmv(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoTbmv(layout, triangle, a_transpose, diagonal, - n, k, - Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc); -} -template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tbmv(const Layout, 
const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV -template -StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xtpmv(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoTpmv(layout, triangle, a_transpose, diagonal, - n, - Buffer(ap_buffer), ap_offset, - Buffer(x_buffer), x_offset, x_inc); -} -template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, 
const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV -template -StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; -} -template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Solves a banded 
triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV -template -StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; -} -template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV -template -StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; -} -template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const 
size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// General rank-1 matrix update: SGER/DGER/HGER -template -StatusCode Ger(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xger(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoGer(layout, - m, n, - alpha, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc, - Buffer(a_buffer), a_offset, a_ld); -} -template StatusCode PUBLIC_API Ger(const Layout, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Ger(const Layout, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Ger(const Layout, - const size_t, const size_t, - const half, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// 
General rank-1 complex matrix update: CGERU/ZGERU -template -StatusCode Geru(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xgeru(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoGeru(layout, - m, n, - alpha, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc, - Buffer(a_buffer), a_offset, a_ld); -} -template StatusCode PUBLIC_API Geru(const Layout, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Geru(const Layout, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// General rank-1 complex conjugated matrix update: CGERC/ZGERC -template -StatusCode Gerc(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xgerc(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoGerc(layout, - m, n, - alpha, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc, - Buffer(a_buffer), a_offset, a_ld); -} -template StatusCode 
PUBLIC_API Gerc(const Layout, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Gerc(const Layout, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Hermitian rank-1 matrix update: CHER/ZHER -template -StatusCode Her(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xher,T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHer(layout, triangle, - n, - alpha, - Buffer>(x_buffer), x_offset, x_inc, - Buffer>(a_buffer), a_offset, a_ld); -} -template StatusCode PUBLIC_API Her(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Her(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Hermitian packed rank-1 matrix update: CHPR/ZHPR -template -StatusCode Hpr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xhpr,T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != 
StatusCode::kSuccess) { return status; } - return routine.DoHpr(layout, triangle, - n, - alpha, - Buffer>(x_buffer), x_offset, x_inc, - Buffer>(ap_buffer), ap_offset); -} -template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); - -// Hermitian rank-2 matrix update: CHER2/ZHER2 -template -StatusCode Her2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xher2(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHer2(layout, triangle, - n, - alpha, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc, - Buffer(a_buffer), a_offset, a_ld); -} -template StatusCode PUBLIC_API Her2(const Layout, const Triangle, - const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Her2(const Layout, const Triangle, - const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 -template -StatusCode Hpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const 
cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xhpr2(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHpr2(layout, triangle, - n, - alpha, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc, - Buffer(ap_buffer), ap_offset); -} -template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, - const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, - const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); - -// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR -template -StatusCode Syr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsyr(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSyr(layout, triangle, - n, - alpha, - Buffer(x_buffer), x_offset, x_inc, - Buffer(a_buffer), a_offset, a_ld); -} -template StatusCode PUBLIC_API Syr(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Syr(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, 
const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Syr(const Layout, const Triangle, - const size_t, - const half, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR -template -StatusCode Spr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xspr(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSpr(layout, triangle, - n, - alpha, - Buffer(x_buffer), x_offset, x_inc, - Buffer(ap_buffer), ap_offset); -} -template StatusCode PUBLIC_API Spr(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Spr(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Spr(const Layout, const Triangle, - const size_t, - const half, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); - -// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 -template -StatusCode Syr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsyr2(queue_cpp, 
event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSyr2(layout, triangle, - n, - alpha, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc, - Buffer(a_buffer), a_offset, a_ld); -} -template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, - const size_t, - const half, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 -template -StatusCode Spr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xspr2(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSpr2(layout, triangle, - n, - alpha, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc, - Buffer(ap_buffer), ap_offset); -} -template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, 
cl_event*); -template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, - const size_t, - const half, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); - -// ================================================================================================= -// BLAS level-3 (matrix-matrix) routines -// ================================================================================================= - -// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM -template -StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xgemm(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoGemm(layout, a_transpose, b_transpose, - m, n, k, - alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, - beta, - Buffer(c_buffer), c_offset, c_ld); -} -template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, 
const Transpose, - const size_t, const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, - const half, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const half, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM -template -StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsymm(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSymm(layout, side, triangle, - m, n, - alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, - beta, - 
Buffer(c_buffer), c_offset, c_ld); -} -template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, - const half, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const half, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM -template -StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto 
queue_cpp = Queue(*queue); - auto routine = Xhemm(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHemm(layout, side, triangle, - m, n, - alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, - beta, - Buffer(c_buffer), c_offset, c_ld); -} -template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK -template -StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const T beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsyrk(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSyrk(layout, triangle, a_transpose, - n, k, - alpha, - Buffer(a_buffer), a_offset, a_ld, - beta, - Buffer(c_buffer), c_offset, c_ld); -} -template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Syrk(const Layout, const 
Triangle, const Transpose, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const half, - const cl_mem, const size_t, const size_t, - const half, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Rank-K update of a hermitian matrix: CHERK/ZHERK -template -StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const T beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xherk,T>(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHerk(layout, triangle, a_transpose, - n, k, - alpha, - Buffer>(a_buffer), a_offset, a_ld, - beta, - Buffer>(c_buffer), c_offset, c_ld); -} -template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode 
PUBLIC_API Herk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K -template -StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xsyr2k(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoSyr2k(layout, triangle, ab_transpose, - n, k, - alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, - beta, - Buffer(c_buffer), c_offset, c_ld); -} -template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, 
cl_event*); -template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const half, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const half, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K -template -StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const U beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xher2k(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoHer2k(layout, triangle, ab_transpose, - n, k, - alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, - beta, - Buffer(c_buffer), c_offset, c_ld); -} -template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - 
cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM -template -StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xtrmm(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, - m, n, - alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld); -} -template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trmm(const Layout, const Side, 
const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const half, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM -template -StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const T, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; -} -template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const half, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// 
================================================================================================= -// Extra non-BLAS routines (level-X) -// ================================================================================================= - -// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY -template -StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto routine = Xomatcopy(queue_cpp, event); - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - return routine.DoOmatcopy(layout, a_transpose, - m, n, - alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld); -} -template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, - const size_t, const 
size_t, - const half, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); - -// ================================================================================================= - -// Clears the cache of stored binaries -StatusCode ClearCache() { return CacheClearAll(); } - -// Fills the cache with all binaries for a specific device -// TODO: Add half-precision FP16 set-up calls -StatusCode FillCache(const cl_device_id device) { - try { - - // Creates a sample context and queue to match the normal routine calling conventions - auto device_cpp = Device(device); - auto context = Context(device_cpp); - auto queue = Queue(context, device_cpp); - - // Runs all the level 1 set-up functions - Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); - Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); - Xscal(queue, nullptr).SetUp(); Xscal(queue, nullptr).SetUp(); Xscal(queue, nullptr).SetUp(); Xscal(queue, nullptr).SetUp(); - Xcopy(queue, nullptr).SetUp(); Xcopy(queue, nullptr).SetUp(); Xcopy(queue, nullptr).SetUp(); Xcopy(queue, nullptr).SetUp(); - Xaxpy(queue, nullptr).SetUp(); Xaxpy(queue, nullptr).SetUp(); Xaxpy(queue, nullptr).SetUp(); Xaxpy(queue, nullptr).SetUp(); - Xdot(queue, nullptr).SetUp(); Xdot(queue, nullptr).SetUp(); - Xdotu(queue, nullptr).SetUp(); Xdotu(queue, nullptr).SetUp(); - Xdotc(queue, nullptr).SetUp(); Xdotc(queue, nullptr).SetUp(); - Xnrm2(queue, nullptr).SetUp(); Xnrm2(queue, nullptr).SetUp(); Xnrm2(queue, nullptr).SetUp(); Xnrm2(queue, nullptr).SetUp(); - Xasum(queue, nullptr).SetUp(); Xasum(queue, nullptr).SetUp(); Xasum(queue, nullptr).SetUp(); Xasum(queue, nullptr).SetUp(); - Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); - Xamax(queue, nullptr).SetUp(); Xamax(queue, 
nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); - Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); - Xmin(queue, nullptr).SetUp(); Xmin(queue, nullptr).SetUp(); Xmin(queue, nullptr).SetUp(); Xmin(queue, nullptr).SetUp(); - - // Runs all the level 2 set-up functions - Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); - Xgbmv(queue, nullptr).SetUp(); Xgbmv(queue, nullptr).SetUp(); Xgbmv(queue, nullptr).SetUp(); Xgbmv(queue, nullptr).SetUp(); - Xhemv(queue, nullptr).SetUp(); Xhemv(queue, nullptr).SetUp(); - Xhbmv(queue, nullptr).SetUp(); Xhbmv(queue, nullptr).SetUp(); - Xhpmv(queue, nullptr).SetUp(); Xhpmv(queue, nullptr).SetUp(); - Xsymv(queue, nullptr).SetUp(); Xsymv(queue, nullptr).SetUp(); - Xsbmv(queue, nullptr).SetUp(); Xsbmv(queue, nullptr).SetUp(); - Xspmv(queue, nullptr).SetUp(); Xspmv(queue, nullptr).SetUp(); - Xtrmv(queue, nullptr).SetUp(); Xtrmv(queue, nullptr).SetUp(); Xtrmv(queue, nullptr).SetUp(); Xtrmv(queue, nullptr).SetUp(); - Xtbmv(queue, nullptr).SetUp(); Xtbmv(queue, nullptr).SetUp(); Xtbmv(queue, nullptr).SetUp(); Xtbmv(queue, nullptr).SetUp(); - Xtpmv(queue, nullptr).SetUp(); Xtpmv(queue, nullptr).SetUp(); Xtpmv(queue, nullptr).SetUp(); Xtpmv(queue, nullptr).SetUp(); - Xger(queue, nullptr).SetUp(); Xger(queue, nullptr).SetUp(); - Xgeru(queue, nullptr).SetUp(); Xgeru(queue, nullptr).SetUp(); - Xgerc(queue, nullptr).SetUp(); Xgerc(queue, nullptr).SetUp(); - Xher(queue, nullptr).SetUp(); Xher(queue, nullptr).SetUp(); - Xhpr(queue, nullptr).SetUp(); Xhpr(queue, nullptr).SetUp(); - Xher2(queue, nullptr).SetUp(); Xher2(queue, nullptr).SetUp(); - Xhpr2(queue, nullptr).SetUp(); Xhpr2(queue, nullptr).SetUp(); - Xsyr(queue, nullptr).SetUp(); Xsyr(queue, nullptr).SetUp(); - Xspr(queue, nullptr).SetUp(); Xspr(queue, nullptr).SetUp(); - Xsyr2(queue, nullptr).SetUp(); Xsyr2(queue, 
nullptr).SetUp(); - Xspr2(queue, nullptr).SetUp(); Xspr2(queue, nullptr).SetUp(); - - // Runs all the level 3 set-up functions - Xgemm(queue, nullptr).SetUp(); Xgemm(queue, nullptr).SetUp(); Xgemm(queue, nullptr).SetUp(); Xgemm(queue, nullptr).SetUp(); - Xsymm(queue, nullptr).SetUp(); Xsymm(queue, nullptr).SetUp(); Xsymm(queue, nullptr).SetUp(); Xsymm(queue, nullptr).SetUp(); - Xhemm(queue, nullptr).SetUp(); Xhemm(queue, nullptr).SetUp(); - Xsyrk(queue, nullptr).SetUp(); Xsyrk(queue, nullptr).SetUp(); Xsyrk(queue, nullptr).SetUp(); Xsyrk(queue, nullptr).SetUp(); - Xherk(queue, nullptr).SetUp(); Xherk(queue, nullptr).SetUp(); - Xsyr2k(queue, nullptr).SetUp(); Xsyr2k(queue, nullptr).SetUp(); Xsyr2k(queue, nullptr).SetUp(); Xsyr2k(queue, nullptr).SetUp(); - Xher2k(queue, nullptr).SetUp(); Xher2k(queue, nullptr).SetUp(); - Xtrmm(queue, nullptr).SetUp(); Xtrmm(queue, nullptr).SetUp(); Xtrmm(queue, nullptr).SetUp(); Xtrmm(queue, nullptr).SetUp(); - - // Runs all the level 3 set-up functions - Xomatcopy(queue, nullptr).SetUp(); Xomatcopy(queue, nullptr).SetUp(); Xomatcopy(queue, nullptr).SetUp(); Xomatcopy(queue, nullptr).SetUp(); - - } catch (...) { return StatusCode::kBuildProgramFailure; } - return StatusCode::kSuccess; -} - -// ================================================================================================= -} // namespace clblast diff --git a/src/clblast.cpp b/src/clblast.cpp new file mode 100644 index 00000000..88d60772 --- /dev/null +++ b/src/clblast.cpp @@ -0,0 +1,2193 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements all the BLAS API calls. 
In all cases, it does not much more than creating +// a new object of the appropriate type, and calling the main routine on that object. It forwards +// all status codes to the caller. +// +// ================================================================================================= + +#include + +#include "clblast.h" +#include "public_api.hpp" +#include "cache.hpp" + +// BLAS level-1 includes +#include "routines/level1/xswap.hpp" +#include "routines/level1/xscal.hpp" +#include "routines/level1/xcopy.hpp" +#include "routines/level1/xaxpy.hpp" +#include "routines/level1/xdot.hpp" +#include "routines/level1/xdotu.hpp" +#include "routines/level1/xdotc.hpp" +#include "routines/level1/xnrm2.hpp" +#include "routines/level1/xasum.hpp" +#include "routines/level1/xsum.hpp" // non-BLAS routine +#include "routines/level1/xamax.hpp" +#include "routines/level1/xmax.hpp" // non-BLAS routine +#include "routines/level1/xmin.hpp" // non-BLAS routine + +// BLAS level-2 includes +#include "routines/level2/xgemv.hpp" +#include "routines/level2/xgbmv.hpp" +#include "routines/level2/xhemv.hpp" +#include "routines/level2/xhbmv.hpp" +#include "routines/level2/xhpmv.hpp" +#include "routines/level2/xsymv.hpp" +#include "routines/level2/xsbmv.hpp" +#include "routines/level2/xspmv.hpp" +#include "routines/level2/xtrmv.hpp" +#include "routines/level2/xtbmv.hpp" +#include "routines/level2/xtpmv.hpp" +#include "routines/level2/xger.hpp" +#include "routines/level2/xgeru.hpp" +#include "routines/level2/xgerc.hpp" +#include "routines/level2/xher.hpp" +#include "routines/level2/xhpr.hpp" +#include "routines/level2/xher2.hpp" +#include "routines/level2/xhpr2.hpp" +#include "routines/level2/xsyr.hpp" +#include "routines/level2/xspr.hpp" +#include "routines/level2/xsyr2.hpp" +#include "routines/level2/xspr2.hpp" + +// BLAS level-3 includes +#include "routines/level3/xgemm.hpp" +#include "routines/level3/xsymm.hpp" +#include "routines/level3/xhemm.hpp" +#include "routines/level3/xsyrk.hpp" 
+#include "routines/level3/xherk.hpp" +#include "routines/level3/xsyr2k.hpp" +#include "routines/level3/xher2k.hpp" +#include "routines/level3/xtrmm.hpp" + +// Level-x includes (non-BLAS) +#include "routines/levelx/xomatcopy.hpp" + +namespace clblast { + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +template +StatusCode Rotg(cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotg(cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Rotg(cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Generate modified givens plane rotation: SROTMG/DROTMG +template +StatusCode Rotmg(cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Apply givens plane rotation: SROT/DROT +template +StatusCode Rot(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + const T, + const T, + cl_command_queue*, cl_event*) { + return 
StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rot(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + const float, + const float, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Rot(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + const double, + const double, + cl_command_queue*, cl_event*); + +// Apply modified givens plane rotation: SROTM/DROTM +template +StatusCode Rotm(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotm(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Rotm(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +template +StatusCode Swap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xswap(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSwap(n, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode PUBLIC_API Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Swap(const size_t, + cl_mem, const size_t, const size_t, 
+ cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +template +StatusCode Scal(const size_t n, + const T alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xscal(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoScal(n, + alpha, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode PUBLIC_API Scal(const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Scal(const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Scal(const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Scal(const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Scal(const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +template +StatusCode Copy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xcopy(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return 
routine.DoCopy(n, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode PUBLIC_API Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +template +StatusCode Axpy(const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xaxpy(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoAxpy(n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode PUBLIC_API Axpy(const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Axpy(const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Axpy(const size_t, + const float2, + const cl_mem, const size_t, const size_t, + cl_mem, 
const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Axpy(const size_t, + const double2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Axpy(const size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Dot product of two vectors: SDOT/DDOT/HDOT +template +StatusCode Dot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xdot(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoDot(n, + Buffer(dot_buffer), dot_offset, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode PUBLIC_API Dot(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Dot(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Dot(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Dot product of two complex vectors: CDOTU/ZDOTU +template +StatusCode Dotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto 
routine = Xdotu(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoDotu(n, + Buffer(dot_buffer), dot_offset, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode PUBLIC_API Dotu(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Dotu(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +template +StatusCode Dotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xdotc(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoDotc(n, + Buffer(dot_buffer), dot_offset, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode PUBLIC_API Dotc(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Dotc(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +template +StatusCode Nrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); 
+ auto routine = Xnrm2(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoNrm2(n, + Buffer(nrm2_buffer), nrm2_offset, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode PUBLIC_API Nrm2(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Nrm2(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Nrm2(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Nrm2(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Nrm2(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +template +StatusCode Asum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xasum(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoAsum(n, + Buffer(asum_buffer), asum_offset, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, 
cl_event*); +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +template +StatusCode Sum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xsum(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSum(n, + Buffer(sum_buffer), sum_offset, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode PUBLIC_API Sum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +template +StatusCode Amax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xamax(queue_cpp, event); + auto status 
= routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoAmax(n, + Buffer(imax_buffer), imax_offset, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +template +StatusCode Max(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xmax(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoMax(n, + Buffer(imax_buffer), imax_offset, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode PUBLIC_API Max(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Max(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Max(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Max(const 
size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Max(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +template +StatusCode Min(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xmin(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoMin(n, + Buffer(imin_buffer), imin_offset, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +template +StatusCode Gemv(const Layout layout, const Transpose a_transpose, + const size_t m, 
const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xgemv(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoGemv(layout, a_transpose, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + 
const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +template +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xgbmv(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoGbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const 
size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV +template +StatusCode Hemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xhemv(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHemv(layout, triangle, + n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t 
n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xhbmv(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHbmv(layout, triangle, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xhpmv(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHpmv(layout, triangle, + n, + alpha, + Buffer(ap_buffer), ap_offset, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode 
PUBLIC_API Hpmv(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +template +StatusCode Symv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xsymv(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSymv(layout, triangle, + n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode PUBLIC_API Symv(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symv(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symv(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const 
size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +template +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xsbmv(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSbmv(layout, triangle, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +template +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xspmv(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSpmv(layout, triangle, + n, + alpha, + Buffer(ap_buffer), ap_offset, + Buffer(x_buffer), x_offset, x_inc, + beta, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +template +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xtrmv(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoTrmv(layout, triangle, a_transpose, diagonal, + n, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc); 
+} +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +template +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xtbmv(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoTbmv(layout, triangle, a_transpose, diagonal, + n, k, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + 
const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +template +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xtpmv(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoTpmv(layout, triangle, a_transpose, diagonal, + n, + Buffer(ap_buffer), ap_offset, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const 
size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template +StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsv(const Layout, const 
Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template +StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template +StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const 
size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// General rank-1 matrix update: SGER/DGER/HGER +template +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xger(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoGer(layout, + m, n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); +} +template StatusCode PUBLIC_API Ger(const Layout, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Ger(const Layout, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Ger(const Layout, + const size_t, 
const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// General rank-1 complex matrix update: CGERU/ZGERU +template +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xgeru(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoGeru(layout, + m, n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); +} +template StatusCode PUBLIC_API Geru(const Layout, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Geru(const Layout, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template +StatusCode Gerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xgerc(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; 
} + return routine.DoGerc(layout, + m, n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); +} +template StatusCode PUBLIC_API Gerc(const Layout, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gerc(const Layout, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian rank-1 matrix update: CHER/ZHER +template +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xher,T>(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHer(layout, triangle, + n, + alpha, + Buffer>(x_buffer), x_offset, x_inc, + Buffer>(a_buffer), a_offset, a_ld); +} +template StatusCode PUBLIC_API Her(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Her(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t 
ap_offset, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xhpr,T>(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHpr(layout, triangle, + n, + alpha, + Buffer>(x_buffer), x_offset, x_inc, + Buffer>(ap_buffer), ap_offset); +} +template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xher2(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHer2(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); +} +template StatusCode PUBLIC_API Her2(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Her2(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, 
cl_event*); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xhpr2(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHpr2(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(ap_buffer), ap_offset); +} +template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +template +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSyr(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(a_buffer), a_offset, a_ld); +} +template StatusCode PUBLIC_API Syr(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + 
cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +template +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xspr(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSpr(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(ap_buffer), ap_offset); +} +template StatusCode PUBLIC_API Spr(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spr(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spr(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +template +StatusCode Syr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t 
y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr2(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSyr2(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(a_buffer), a_offset, a_ld); +} +template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +template +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xspr2(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSpr2(layout, triangle, + n, + alpha, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc, + Buffer(ap_buffer), ap_offset); +} +template StatusCode PUBLIC_API Spr2(const 
Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +template +StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xgemm(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoGemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); +} +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const 
cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +template +StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xsymm(queue_cpp, event); + auto status = routine.SetUp(); + if (status != 
StatusCode::kSuccess) { return status; } + return routine.DoSymm(layout, side, triangle, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); +} +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +template +StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const 
cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xhemm(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHemm(layout, side, triangle, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); +} +template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +template +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xsyrk(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSyrk(layout, triangle, a_transpose, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); +} +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const 
float, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +template +StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xherk,T>(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHerk(layout, triangle, a_transpose, + n, k, + alpha, + Buffer>(a_buffer), a_offset, a_ld, + beta, + Buffer>(c_buffer), c_offset, c_ld); +} +template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const Transpose, + const 
size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +template +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xsyr2k(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); +} +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const 
size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +template +StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xher2k(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHer2k(layout, triangle, ab_transpose, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); +} +template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Her2k(const Layout, 
const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +template +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xtrmm(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld); +} +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const 
double2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM +template +StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const 
size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +template +StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xomatcopy(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoOmatcopy(layout, a_transpose, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld); +} +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, 
const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// ================================================================================================= + +// Clears the cache of stored binaries +StatusCode ClearCache() { return CacheClearAll(); } + +// Fills the cache with all binaries for a specific device +// TODO: Add half-precision FP16 set-up calls +StatusCode FillCache(const cl_device_id device) { + try { + + // Creates a sample context and queue to match the normal routine calling conventions + auto device_cpp = Device(device); + auto context = Context(device_cpp); + auto queue = Queue(context, device_cpp); + + // Runs all the level 1 set-up functions + Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); + Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); + Xscal(queue, nullptr).SetUp(); Xscal(queue, nullptr).SetUp(); Xscal(queue, nullptr).SetUp(); Xscal(queue, nullptr).SetUp(); + Xcopy(queue, nullptr).SetUp(); Xcopy(queue, nullptr).SetUp(); Xcopy(queue, nullptr).SetUp(); Xcopy(queue, nullptr).SetUp(); + Xaxpy(queue, nullptr).SetUp(); Xaxpy(queue, nullptr).SetUp(); Xaxpy(queue, nullptr).SetUp(); Xaxpy(queue, nullptr).SetUp(); + Xdot(queue, nullptr).SetUp(); Xdot(queue, nullptr).SetUp(); + Xdotu(queue, nullptr).SetUp(); Xdotu(queue, nullptr).SetUp(); + Xdotc(queue, nullptr).SetUp(); Xdotc(queue, nullptr).SetUp(); + Xnrm2(queue, nullptr).SetUp(); Xnrm2(queue, nullptr).SetUp(); Xnrm2(queue, nullptr).SetUp(); Xnrm2(queue, nullptr).SetUp(); + Xasum(queue, nullptr).SetUp(); Xasum(queue, nullptr).SetUp(); Xasum(queue, nullptr).SetUp(); Xasum(queue, nullptr).SetUp(); + Xsum(queue, 
nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); + Xamax(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); + Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); + Xmin(queue, nullptr).SetUp(); Xmin(queue, nullptr).SetUp(); Xmin(queue, nullptr).SetUp(); Xmin(queue, nullptr).SetUp(); + + // Runs all the level 2 set-up functions + Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); + Xgbmv(queue, nullptr).SetUp(); Xgbmv(queue, nullptr).SetUp(); Xgbmv(queue, nullptr).SetUp(); Xgbmv(queue, nullptr).SetUp(); + Xhemv(queue, nullptr).SetUp(); Xhemv(queue, nullptr).SetUp(); + Xhbmv(queue, nullptr).SetUp(); Xhbmv(queue, nullptr).SetUp(); + Xhpmv(queue, nullptr).SetUp(); Xhpmv(queue, nullptr).SetUp(); + Xsymv(queue, nullptr).SetUp(); Xsymv(queue, nullptr).SetUp(); + Xsbmv(queue, nullptr).SetUp(); Xsbmv(queue, nullptr).SetUp(); + Xspmv(queue, nullptr).SetUp(); Xspmv(queue, nullptr).SetUp(); + Xtrmv(queue, nullptr).SetUp(); Xtrmv(queue, nullptr).SetUp(); Xtrmv(queue, nullptr).SetUp(); Xtrmv(queue, nullptr).SetUp(); + Xtbmv(queue, nullptr).SetUp(); Xtbmv(queue, nullptr).SetUp(); Xtbmv(queue, nullptr).SetUp(); Xtbmv(queue, nullptr).SetUp(); + Xtpmv(queue, nullptr).SetUp(); Xtpmv(queue, nullptr).SetUp(); Xtpmv(queue, nullptr).SetUp(); Xtpmv(queue, nullptr).SetUp(); + Xger(queue, nullptr).SetUp(); Xger(queue, nullptr).SetUp(); + Xgeru(queue, nullptr).SetUp(); Xgeru(queue, nullptr).SetUp(); + Xgerc(queue, nullptr).SetUp(); Xgerc(queue, nullptr).SetUp(); + Xher(queue, nullptr).SetUp(); Xher(queue, nullptr).SetUp(); + Xhpr(queue, nullptr).SetUp(); Xhpr(queue, nullptr).SetUp(); + Xher2(queue, nullptr).SetUp(); Xher2(queue, nullptr).SetUp(); + Xhpr2(queue, nullptr).SetUp(); Xhpr2(queue, nullptr).SetUp(); + Xsyr(queue, 
nullptr).SetUp(); Xsyr(queue, nullptr).SetUp(); + Xspr(queue, nullptr).SetUp(); Xspr(queue, nullptr).SetUp(); + Xsyr2(queue, nullptr).SetUp(); Xsyr2(queue, nullptr).SetUp(); + Xspr2(queue, nullptr).SetUp(); Xspr2(queue, nullptr).SetUp(); + + // Runs all the level 3 set-up functions + Xgemm(queue, nullptr).SetUp(); Xgemm(queue, nullptr).SetUp(); Xgemm(queue, nullptr).SetUp(); Xgemm(queue, nullptr).SetUp(); + Xsymm(queue, nullptr).SetUp(); Xsymm(queue, nullptr).SetUp(); Xsymm(queue, nullptr).SetUp(); Xsymm(queue, nullptr).SetUp(); + Xhemm(queue, nullptr).SetUp(); Xhemm(queue, nullptr).SetUp(); + Xsyrk(queue, nullptr).SetUp(); Xsyrk(queue, nullptr).SetUp(); Xsyrk(queue, nullptr).SetUp(); Xsyrk(queue, nullptr).SetUp(); + Xherk(queue, nullptr).SetUp(); Xherk(queue, nullptr).SetUp(); + Xsyr2k(queue, nullptr).SetUp(); Xsyr2k(queue, nullptr).SetUp(); Xsyr2k(queue, nullptr).SetUp(); Xsyr2k(queue, nullptr).SetUp(); + Xher2k(queue, nullptr).SetUp(); Xher2k(queue, nullptr).SetUp(); + Xtrmm(queue, nullptr).SetUp(); Xtrmm(queue, nullptr).SetUp(); Xtrmm(queue, nullptr).SetUp(); Xtrmm(queue, nullptr).SetUp(); + + // Runs all the level 3 set-up functions + Xomatcopy(queue, nullptr).SetUp(); Xomatcopy(queue, nullptr).SetUp(); Xomatcopy(queue, nullptr).SetUp(); Xomatcopy(queue, nullptr).SetUp(); + + } catch (...) { return StatusCode::kBuildProgramFailure; } + return StatusCode::kSuccess; +} + +// ================================================================================================= +} // namespace clblast diff --git a/src/clblast_c.cc b/src/clblast_c.cc deleted file mode 100644 index 9ea2c884..00000000 --- a/src/clblast_c.cc +++ /dev/null @@ -1,2927 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements all the plain C BLAS API calls. This forwards the calls to the C++ API. -// -// ================================================================================================= - -#include - -#include "clblast_c.h" -#include "clblast.h" -#include "utilities.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// ================================================================================================= -// BLAS level-1 (vector-vector) routines -// ================================================================================================= - -// ROTG -StatusCode CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rotg(sa_buffer, sa_offset, - sb_buffer, sb_offset, - sc_buffer, sc_offset, - ss_buffer, ss_offset, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rotg(sa_buffer, sa_offset, - sb_buffer, sb_offset, - sc_buffer, sc_offset, - ss_buffer, ss_offset, - queue, event); - return static_cast(status); -} - -// ROTMG -StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, - cl_command_queue* queue, cl_event* 
event) { - auto status = clblast::Rotmg(sd1_buffer, sd1_offset, - sd2_buffer, sd2_offset, - sx1_buffer, sx1_offset, - sy1_buffer, sy1_offset, - sparam_buffer, sparam_offset, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rotmg(sd1_buffer, sd1_offset, - sd2_buffer, sd2_offset, - sx1_buffer, sx1_offset, - sy1_buffer, sy1_offset, - sparam_buffer, sparam_offset, - queue, event); - return static_cast(status); -} - -// ROT -StatusCode CLBlastSrot(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const float cos, - const float sin, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rot(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - cos, - sin, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDrot(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const double cos, - const double sin, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rot(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - cos, - sin, - queue, event); - return static_cast(status); -} - -// ROTM -StatusCode CLBlastSrotm(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rotm(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - sparam_buffer, sparam_offset, - queue, event); - return 
static_cast(status); -} -StatusCode CLBlastDrotm(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rotm(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - sparam_buffer, sparam_offset, - queue, event); - return static_cast(status); -} - -// SWAP -StatusCode CLBlastSswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Swap(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Swap(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Swap(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Swap(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem 
y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Swap(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// SCAL -StatusCode CLBlastSscal(const size_t n, - const float alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Scal(n, - alpha, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDscal(const size_t n, - const double alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Scal(n, - alpha, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCscal(const size_t n, - const cl_float2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Scal(n, - float2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZscal(const size_t n, - const cl_double2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Scal(n, - double2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHscal(const size_t n, - const cl_half alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Scal(n, - alpha, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} - -// COPY -StatusCode CLBlastScopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - 
auto status = clblast::Copy(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Copy(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Copy(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Copy(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Copy(n, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// AXPY -StatusCode CLBlastSaxpy(const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode 
CLBlastDaxpy(const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCaxpy(const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - float2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZaxpy(const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - double2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHaxpy(const size_t n, - const cl_half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Axpy(n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// DOT -StatusCode CLBlastSdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dot(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - 
queue, event); - return static_cast(status); -} -StatusCode CLBlastDdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dot(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dot(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// DOTU -StatusCode CLBlastCdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dotu(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dotu(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// DOTC -StatusCode CLBlastCdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const 
size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dotc(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Dotc(n, - dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// NRM2 -StatusCode CLBlastSnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Nrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Nrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastScnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Nrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDznrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Nrm2(n, - nrm2_buffer, nrm2_offset, - 
x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Nrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} - -// ASUM -StatusCode CLBlastSasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Asum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Asum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastScasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Asum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDzasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Asum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* 
event) { - auto status = clblast::Asum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} - -// SUM -StatusCode CLBlastSsum(const size_t n, - cl_mem sum_buffer, const size_t sum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sum(n, - sum_buffer, sum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDsum(const size_t n, - cl_mem sum_buffer, const size_t sum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sum(n, - sum_buffer, sum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastScsum(const size_t n, - cl_mem sum_buffer, const size_t sum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sum(n, - sum_buffer, sum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDzsum(const size_t n, - cl_mem sum_buffer, const size_t sum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sum(n, - sum_buffer, sum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHsum(const size_t n, - cl_mem sum_buffer, const size_t sum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sum(n, - sum_buffer, sum_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} - -// AMAX -StatusCode CLBlastiSamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, 
const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Amax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastiDamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Amax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastiCamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Amax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastiZamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Amax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastiHamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Amax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} - -// MAX -StatusCode CLBlastiSmax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Max(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastiDmax(const size_t n, - cl_mem imax_buffer, 
const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Max(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastiCmax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Max(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastiZmax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Max(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastiHmax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Max(n, - imax_buffer, imax_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} - -// MIN -StatusCode CLBlastiSmin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Min(n, - imin_buffer, imin_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastiDmin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Min(n, - imin_buffer, imin_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} 
-StatusCode CLBlastiCmin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Min(n, - imin_buffer, imin_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastiZmin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Min(n, - imin_buffer, imin_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastiHmin(const size_t n, - cl_mem imin_buffer, const size_t imin_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Min(n, - imin_buffer, imin_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} - -// ================================================================================================= -// BLAS level-2 (matrix-vector) routines -// ================================================================================================= - -// GEMV -StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDgemv(const Layout layout, const Transpose a_transpose, - 
const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - float2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - double2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHgemv(const Layout layout, const 
Transpose a_transpose, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// GBMV -StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCgbmv(const Layout layout, const 
Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - float2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - double2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, 
x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// HEMV -StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hemv(static_cast(layout), - static_cast(triangle), - n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - float2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hemv(static_cast(layout), - static_cast(triangle), - n, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - double2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// HBMV -StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hbmv(static_cast(layout), - static_cast(triangle), - n, k, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, 
a_ld, - x_buffer, x_offset, x_inc, - float2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hbmv(static_cast(layout), - static_cast(triangle), - n, k, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - double2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// HPMV -StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hpmv(static_cast(layout), - static_cast(triangle), - n, - float2{alpha.s[0], alpha.s[1]}, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - float2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hpmv(static_cast(layout), - static_cast(triangle), - n, - double2{alpha.s[0], alpha.s[1]}, - ap_buffer, ap_offset, - 
x_buffer, x_offset, x_inc, - double2{beta.s[0], beta.s[1]}, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// SYMV -StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symv(static_cast(layout), - static_cast(triangle), - n, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symv(static_cast(layout), - static_cast(triangle), - n, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symv(static_cast(layout), - static_cast(triangle), - n, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// SBMV -StatusCode 
CLBlastSsbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Sbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// SPMV -StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - 
const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spmv(static_cast(layout), - static_cast(triangle), - n, - alpha, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spmv(static_cast(layout), - static_cast(triangle), - n, - alpha, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_half alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spmv(static_cast(layout), - static_cast(triangle), - n, - alpha, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - beta, - y_buffer, y_offset, y_inc, - queue, event); - return static_cast(status); -} - -// TRMV -StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = 
clblast::Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose 
a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} - -// TBMV -StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = 
clblast::Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} - -// TPMV -StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDtpmv(const Layout layout, const Triangle 
triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - 
ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} - -// TRSV -StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} - -// TBSV -StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer, 
a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} - -// TPSV -StatusCode CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, 
const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer, ap_offset, - x_buffer, x_offset, x_inc, - queue, event); - return static_cast(status); -} - -// GER -StatusCode CLBlastSger(const Layout layout, - const size_t m, const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Ger(static_cast(layout), - m, n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDger(const Layout layout, - const size_t m, const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Ger(static_cast(layout), - m, n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHger(const Layout layout, - const 
size_t m, const size_t n, - const cl_half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Ger(static_cast(layout), - m, n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} - -// GERU -StatusCode CLBlastCgeru(const Layout layout, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Geru(static_cast(layout), - m, n, - float2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZgeru(const Layout layout, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Geru(static_cast(layout), - m, n, - double2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} - -// GERC -StatusCode CLBlastCgerc(const Layout layout, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* 
queue, cl_event* event) { - auto status = clblast::Gerc(static_cast(layout), - m, n, - float2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZgerc(const Layout layout, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gerc(static_cast(layout), - m, n, - double2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} - -// HER -StatusCode CLBlastCher(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Her(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZher(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Her(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} - -// HPR -StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - 
cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hpr(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hpr(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast(status); -} - -// HER2 -StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Her2(static_cast(layout), - static_cast(triangle), - n, - float2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Her2(static_cast(layout), - static_cast(triangle), - n, - double2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} - -// HPR2 -StatusCode 
CLBlastChpr2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hpr2(static_cast(layout), - static_cast(triangle), - n, - float2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hpr2(static_cast(layout), - static_cast(triangle), - n, - double2{alpha.s[0], alpha.s[1]}, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast(status); -} - -// SYR -StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr(static_cast(layout), - 
static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle, - const size_t n, - const cl_half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} - -// SPR -StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spr(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spr(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHspr(const Layout layout, const Triangle triangle, - const size_t n, - const cl_half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spr(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, - queue, event); - return 
static_cast(status); -} - -// SYR2 -StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld, - queue, event); - return static_cast(status); -} - -// SPR2 -StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const 
size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spr2(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spr2(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Spr2(static_cast(layout), - static_cast(triangle), - n, - alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, - queue, event); - return static_cast(status); -} - -// ================================================================================================= -// BLAS level-3 (matrix-matrix) routines -// ================================================================================================= - -// GEMM -StatusCode CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const 
size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - float2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const Transpose 
b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - double2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} - -// SYMM -StatusCode CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, 
c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - float2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - double2{alpha.s[0], alpha.s[1]}, - 
a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - double2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} - -// HEMM -StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Hemm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - float2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZhemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = 
clblast::Hemm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - double2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} - -// SYRK -StatusCode CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, 
a_ld, - float2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - double2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} - -// HERK -StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Herk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZherk(const Layout layout, const 
Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Herk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} - -// SYR2K -StatusCode CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCsyr2k(const Layout layout, const Triangle triangle, const 
Transpose ab_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - float2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - double2{beta.s[0], beta.s[1]}, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, 
b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} - -// HER2K -StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Her2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Her2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld, - queue, event); - return static_cast(status); -} - -// TRMM -StatusCode CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmm(static_cast(layout), - static_cast(side), - 
static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - 
static_cast(a_transpose), - static_cast(diagonal), - m, n, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} - -// TRSM -StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - 
static_cast(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - 
static_cast(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} - -// ================================================================================================= -// Extra non-BLAS routines (level-X) -// ================================================================================================= - -// OMATCOPY -StatusCode CLBlastSomatcopy(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastDomatcopy(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastComatcopy(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - float2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} -StatusCode 
CLBlastZomatcopy(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - double2{alpha.s[0], alpha.s[1]}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} -StatusCode CLBlastHomatcopy(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto status = clblast::Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event); - return static_cast(status); -} - -// ================================================================================================= - -// Clears the cache of stored binaries -StatusCode CLBlastClearCache() { - return static_cast(clblast::ClearCache()); -} - -// Fills the cache with binaries for a specific device -StatusCode CLBlastFillCache(const cl_device_id device) { - return static_cast(clblast::FillCache(device)); -} - -// ================================================================================================= diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp new file mode 100644 index 00000000..9ea2c884 --- /dev/null +++ b/src/clblast_c.cpp @@ -0,0 +1,2927 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements all the plain C BLAS API calls. This forwards the calls to the C++ API. +// +// ================================================================================================= + +#include + +#include "clblast_c.h" +#include "clblast.h" +#include "utilities.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// ROTG +StatusCode CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset, + cl_mem sb_buffer, const size_t sb_offset, + cl_mem sc_buffer, const size_t sc_offset, + cl_mem ss_buffer, const size_t ss_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rotg(sa_buffer, sa_offset, + sb_buffer, sb_offset, + sc_buffer, sc_offset, + ss_buffer, ss_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, + cl_mem sb_buffer, const size_t sb_offset, + cl_mem sc_buffer, const size_t sc_offset, + cl_mem ss_buffer, const size_t ss_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rotg(sa_buffer, sa_offset, + sb_buffer, sb_offset, + sc_buffer, sc_offset, + ss_buffer, ss_offset, + queue, event); + return static_cast(status); +} + +// ROTMG +StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, + cl_mem sd2_buffer, const size_t sd2_offset, + cl_mem sx1_buffer, const size_t sx1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* 
event) { + auto status = clblast::Rotmg(sd1_buffer, sd1_offset, + sd2_buffer, sd2_offset, + sx1_buffer, sx1_offset, + sy1_buffer, sy1_offset, + sparam_buffer, sparam_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, + cl_mem sd2_buffer, const size_t sd2_offset, + cl_mem sx1_buffer, const size_t sx1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rotmg(sd1_buffer, sd1_offset, + sd2_buffer, sd2_offset, + sx1_buffer, sx1_offset, + sy1_buffer, sy1_offset, + sparam_buffer, sparam_offset, + queue, event); + return static_cast(status); +} + +// ROT +StatusCode CLBlastSrot(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const float cos, + const float sin, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rot(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + cos, + sin, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDrot(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const double cos, + const double sin, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rot(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + cos, + sin, + queue, event); + return static_cast(status); +} + +// ROTM +StatusCode CLBlastSrotm(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rotm(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + sparam_buffer, sparam_offset, + queue, event); + return 
static_cast(status); +} +StatusCode CLBlastDrotm(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rotm(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + sparam_buffer, sparam_offset, + queue, event); + return static_cast(status); +} + +// SWAP +StatusCode CLBlastSswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem 
y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// SCAL +StatusCode CLBlastSscal(const size_t n, + const float alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + alpha, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDscal(const size_t n, + const double alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + alpha, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCscal(const size_t n, + const cl_float2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZscal(const size_t n, + const cl_double2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHscal(const size_t n, + const cl_half alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + alpha, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// COPY +StatusCode CLBlastScopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + 
auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// AXPY +StatusCode CLBlastSaxpy(const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Axpy(n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode 
CLBlastDaxpy(const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Axpy(n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCaxpy(const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Axpy(n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZaxpy(const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Axpy(n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHaxpy(const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Axpy(n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// DOT +StatusCode CLBlastSdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dot(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + 
queue, event); + return static_cast(status); +} +StatusCode CLBlastDdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dot(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dot(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// DOTU +StatusCode CLBlastCdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dotu(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dotu(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// DOTC +StatusCode CLBlastCdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const 
size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dotc(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dotc(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// NRM2 +StatusCode CLBlastSnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Nrm2(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Nrm2(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastScnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Nrm2(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDznrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Nrm2(n, + nrm2_buffer, nrm2_offset, + 
x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Nrm2(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// ASUM +StatusCode CLBlastSasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastScasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDzasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* 
event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// SUM +StatusCode CLBlastSsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sum(n, + sum_buffer, sum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sum(n, + sum_buffer, sum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastScsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sum(n, + sum_buffer, sum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDzsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sum(n, + sum_buffer, sum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sum(n, + sum_buffer, sum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// AMAX +StatusCode CLBlastiSamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, 
const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiDamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiCamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiZamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiHamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// MAX +StatusCode CLBlastiSmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Max(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiDmax(const size_t n, + cl_mem imax_buffer, 
const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Max(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiCmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Max(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiZmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Max(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiHmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Max(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// MIN +StatusCode CLBlastiSmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiDmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} 
+StatusCode CLBlastiCmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiZmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiHmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// GEMV +StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDgemv(const Layout layout, const Transpose a_transpose, + 
const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHgemv(const Layout layout, const 
Transpose a_transpose, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// GBMV +StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCgbmv(const Layout layout, const 
Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, 
x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// HEMV +StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hemv(static_cast(layout), + static_cast(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hemv(static_cast(layout), + static_cast(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// HBMV +StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, 
a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// HPMV +StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpmv(static_cast(layout), + static_cast(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpmv(static_cast(layout), + static_cast(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + ap_buffer, ap_offset, + 
x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// SYMV +StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Symv(static_cast(layout), + static_cast(triangle), + n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Symv(static_cast(layout), + static_cast(triangle), + n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Symv(static_cast(layout), + static_cast(triangle), + n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// SBMV +StatusCode 
CLBlastSsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// SPMV +StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + 
const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// TRMV +StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = 
clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose 
a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TBMV +StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = 
clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TPMV +StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtpmv(const Layout layout, const Triangle 
triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + 
ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TRSV +StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TBSV +StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, 
a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TPSV +StatusCode CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, 
const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// GER +StatusCode CLBlastSger(const Layout layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Ger(static_cast(layout), + m, n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDger(const Layout layout, + const size_t m, const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Ger(static_cast(layout), + m, n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHger(const Layout layout, + const 
size_t m, const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Ger(static_cast(layout), + m, n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// GERU +StatusCode CLBlastCgeru(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Geru(static_cast(layout), + m, n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZgeru(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Geru(static_cast(layout), + m, n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// GERC +StatusCode CLBlastCgerc(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* 
queue, cl_event* event) { + auto status = clblast::Gerc(static_cast(layout), + m, n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZgerc(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gerc(static_cast(layout), + m, n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// HER +StatusCode CLBlastCher(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZher(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// HPR +StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + 
cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} + +// HER2 +StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her2(static_cast(layout), + static_cast(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her2(static_cast(layout), + static_cast(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// HPR2 +StatusCode 
CLBlastChpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr2(static_cast(layout), + static_cast(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr2(static_cast(layout), + static_cast(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} + +// SYR +StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr(static_cast(layout), + 
static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// SPR +StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHspr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return 
static_cast(status); +} + +// SYR2 +StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// SPR2 +StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const 
size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// GEMM +StatusCode CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const 
size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + float2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const Transpose 
b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + double2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} + +// SYMM +StatusCode CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, 
c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + float2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + double2{alpha.s[0], alpha.s[1]}, + 
a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + double2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} + +// HEMM +StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + float2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = 
clblast::Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + double2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} + +// SYRK +StatusCode CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, 
a_ld, + float2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + double2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} + +// HERK +StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Herk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZherk(const Layout layout, const 
Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Herk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} + +// SYR2K +StatusCode CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCsyr2k(const Layout layout, const Triangle triangle, const 
Transpose ab_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + float2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + double2{beta.s[0], beta.s[1]}, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, 
b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} + +// HER2K +StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} + +// TRMM +StatusCode CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmm(static_cast(layout), + static_cast(side), + 
static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + 
static_cast(a_transpose), + static_cast(diagonal), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} + +// TRSM +StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + 
static_cast(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + 
static_cast(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// OMATCOPY +StatusCode CLBlastSomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastComatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode 
CLBlastZomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastHomatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} + +// ================================================================================================= + +// Clears the cache of stored binaries +StatusCode CLBlastClearCache() { + return static_cast(clblast::ClearCache()); +} + +// Fills the cache with binaries for a specific device +StatusCode CLBlastFillCache(const cl_device_id device) { + return static_cast(clblast::FillCache(device)); +} + +// ================================================================================================= diff --git a/src/database/database.cc b/src/database/database.cc deleted file mode 100644 index 6ec93731..00000000 --- a/src/database/database.cc +++ /dev/null @@ -1,120 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Database class (see the header for information about the class). -// -// ================================================================================================= - -#include "utilities.hpp" - -#include "database/database.hpp" -#include "database/kernels/xaxpy.hpp" -#include "database/kernels/xdot.hpp" -#include "database/kernels/xgemv.hpp" -#include "database/kernels/xger.hpp" -#include "database/kernels/xgemm.hpp" -#include "database/kernels/copy.hpp" -#include "database/kernels/pad.hpp" -#include "database/kernels/transpose.hpp" -#include "database/kernels/padtranspose.hpp" - -namespace clblast { -// ================================================================================================= - -// Initializes the database -const std::vector Database::database = { - XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble, - XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble, - XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble, - XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble, - XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, - CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, - PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble, - TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble, - PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble -}; - -// ================================================================================================= - -// Constructor, computing device properties and populating the parameter-vector from the database -Database::Database(const Queue &queue, const 
std::vector &kernels, - const Precision precision): - parameters_{} { - - // Finds information of the current device - auto device = queue.GetDevice(); - auto device_type = device.Type(); - auto device_vendor = device.Vendor(); - auto device_name = device.Name(); - - // Iterates over all kernels to include, and retrieves the parameters for each of them - for (auto &kernel: kernels) { - auto search_result = Search(kernel, device_type, device_vendor, device_name, precision); - parameters_.insert(search_result.begin(), search_result.end()); - } -} - -// ================================================================================================= - -// Returns a list of OpenCL pre-processor defines in string form -std::string Database::GetDefines() const { - std::string defines{}; - for (auto &parameter: parameters_) { - defines += "#define "+parameter.first+" "+ToString(parameter.second)+"\n"; - } - return defines; -} - -// ================================================================================================= - -// Searches the database for the right kernel and precision -Database::Parameters Database::Search(const std::string &this_kernel, - const std::string &this_type, - const std::string &this_vendor, - const std::string &this_device, - const Precision this_precision) const { - // Set the short vendor name - auto this_short_vendor = this_vendor; - for (auto &combination : kVendorNames) { - if (this_vendor == combination.first) { - this_short_vendor = combination.second; - } - } - - // Selects the right kernel - for (auto &db: database) { - if (db.kernel == this_kernel && db.precision == this_precision) { - - // Searches for the right vendor and device type, or selects the default if unavailable. This - // assumes that the default vendor / device type is last in the database. 
- for (auto &vendor: db.vendors) { - if ((vendor.name == this_short_vendor || vendor.name == kDeviceVendorAll) && - (vendor.type == this_type || vendor.type == kDeviceTypeAll)) { - - // Searches for the right device. If the current device is unavailable, selects the vendor - // default parameters. This assumes the default is last in the database. - for (auto &device: vendor.devices) { - - if (device.name == this_device || device.name == "default") { - - // Sets the parameters accordingly - return device.parameters; - } - } - } - } - } - } - - // If we reached this point, something is wrong - throw std::runtime_error("Database error, could not find a suitable entry"); -} - -// ================================================================================================= -} // namespace clblast diff --git a/src/database/database.cpp b/src/database/database.cpp new file mode 100644 index 00000000..6ec93731 --- /dev/null +++ b/src/database/database.cpp @@ -0,0 +1,120 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Database class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "utilities.hpp" + +#include "database/database.hpp" +#include "database/kernels/xaxpy.hpp" +#include "database/kernels/xdot.hpp" +#include "database/kernels/xgemv.hpp" +#include "database/kernels/xger.hpp" +#include "database/kernels/xgemm.hpp" +#include "database/kernels/copy.hpp" +#include "database/kernels/pad.hpp" +#include "database/kernels/transpose.hpp" +#include "database/kernels/padtranspose.hpp" + +namespace clblast { +// ================================================================================================= + +// Initializes the database +const std::vector Database::database = { + XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble, + XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble, + XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble, + XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble, + XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, + CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, + PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble, + TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble, + PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble +}; + +// ================================================================================================= + +// Constructor, computing device properties and populating the parameter-vector from the database +Database::Database(const Queue &queue, const std::vector &kernels, + const Precision precision): + parameters_{} { + + // Finds information of the current device + auto device = queue.GetDevice(); + auto device_type = device.Type(); + auto device_vendor = device.Vendor(); + auto device_name = device.Name(); + + // Iterates 
over all kernels to include, and retrieves the parameters for each of them + for (auto &kernel: kernels) { + auto search_result = Search(kernel, device_type, device_vendor, device_name, precision); + parameters_.insert(search_result.begin(), search_result.end()); + } +} + +// ================================================================================================= + +// Returns a list of OpenCL pre-processor defines in string form +std::string Database::GetDefines() const { + std::string defines{}; + for (auto ¶meter: parameters_) { + defines += "#define "+parameter.first+" "+ToString(parameter.second)+"\n"; + } + return defines; +} + +// ================================================================================================= + +// Searches the database for the right kernel and precision +Database::Parameters Database::Search(const std::string &this_kernel, + const std::string &this_type, + const std::string &this_vendor, + const std::string &this_device, + const Precision this_precision) const { + // Set the short vendor name + auto this_short_vendor = this_vendor; + for (auto &combination : kVendorNames) { + if (this_vendor == combination.first) { + this_short_vendor = combination.second; + } + } + + // Selects the right kernel + for (auto &db: database) { + if (db.kernel == this_kernel && db.precision == this_precision) { + + // Searches for the right vendor and device type, or selects the default if unavailable. This + // assumes that the default vendor / device type is last in the database. + for (auto &vendor: db.vendors) { + if ((vendor.name == this_short_vendor || vendor.name == kDeviceVendorAll) && + (vendor.type == this_type || vendor.type == kDeviceTypeAll)) { + + // Searches for the right device. If the current device is unavailable, selects the vendor + // default parameters. This assumes the default is last in the database. 
+ for (auto &device: vendor.devices) { + + if (device.name == this_device || device.name == "default") { + + // Sets the parameters accordingly + return device.parameters; + } + } + } + } + } + } + + // If we reached this point, something is wrong + throw std::runtime_error("Database error, could not find a suitable entry"); +} + +// ================================================================================================= +} // namespace clblast diff --git a/src/routine.cc b/src/routine.cc deleted file mode 100644 index d3590896..00000000 --- a/src/routine.cc +++ /dev/null @@ -1,131 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Routine base class (see the header for information about the class). 
-// -// ================================================================================================= - -#include -#include - -#include "routine.hpp" - -namespace clblast { -// ================================================================================================= - -// Constructor: not much here, because no status codes can be returned -Routine::Routine(Queue &queue, EventPointer event, const std::string &name, - const std::vector &routines, const Precision precision): - precision_(precision), - routine_name_(name), - queue_(queue), - event_(event), - context_(queue_.GetContext()), - device_(queue_.GetDevice()), - device_name_(device_.Name()), - db_(queue_, routines, precision_) { -} - -// ================================================================================================= - -// Separate set-up function to allow for status codes to be returned -StatusCode Routine::SetUp() { - - // Queries the cache to see whether or not the program (context-specific) is already there - if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; } - - // Queries the cache to see whether or not the binary (device-specific) is already there. If it - // is, a program is created and stored in the cache - if (BinaryIsInCache(device_name_, precision_, routine_name_)) { - try { - auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_); - auto program = Program(device_, context_, binary); - auto options = std::vector(); - program.Build(device_, options); - StoreProgramToCache(program, context_, precision_, routine_name_); - } catch (...) { return StatusCode::kBuildProgramFailure; } - return StatusCode::kSuccess; - } - - // Otherwise, the kernel will be compiled and program will be built. Both the binary and the - // program will be added to the cache. 
- - // Inspects whether or not cl_khr_fp64 is supported in case of double precision - const auto extensions = device_.Capabilities(); - if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) { - if (extensions.find(kKhronosDoublePrecision) == std::string::npos) { - return StatusCode::kNoDoublePrecision; - } - } - - // As above, but for cl_khr_fp16 (half precision) - if (precision_ == Precision::kHalf) { - if (extensions.find(kKhronosHalfPrecision) == std::string::npos) { - return StatusCode::kNoHalfPrecision; - } - } - - // Loads the common header (typedefs and defines and such) - std::string common_header = - #include "kernels/common.opencl" - ; - - // Collects the parameters for this device in the form of defines, and adds the precision - auto defines = db_.GetDefines(); - defines += "#define PRECISION "+ToString(static_cast(precision_))+"\n"; - - // Adds the name of the routine as a define - defines += "#define ROUTINE_"+routine_name_+"\n"; - - // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve - // performance, but might result in a reduced accuracy. - if (device_.IsAMD() && device_.IsGPU()) { - defines += "#define USE_CL_MAD 1\n"; - } - - // For specific devices, use staggered/shuffled workgroup indices. 
- if (device_.IsAMD() && device_.IsGPU()) { - defines += "#define USE_STAGGERED_INDICES 1\n"; - } - - // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize - // performance through better cache behaviour - if (device_.IsARM() && device_.IsGPU()) { - defines += "#define GLOBAL_MEM_FENCE 1\n"; - } - - // Combines everything together into a single source string - const auto source_string = defines + common_header + source_string_; - - // Compiles the kernel - try { - auto program = Program(context_, source_string); - auto options = std::vector(); - const auto build_status = program.Build(device_, options); - - // Checks for compiler crashes/errors/warnings - if (build_status == BuildStatus::kError) { - const auto message = program.GetBuildInfo(device_); - fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str()); - return StatusCode::kBuildProgramFailure; - } - if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; } - - // Store the compiled binary and program in the cache - const auto binary = program.GetIR(); - StoreBinaryToCache(binary, device_name_, precision_, routine_name_); - StoreProgramToCache(program, context_, precision_, routine_name_); - } catch (...) { return StatusCode::kBuildProgramFailure; } - - // No errors, normal termination of this function - return StatusCode::kSuccess; -} - -// ================================================================================================= -} // namespace clblast diff --git a/src/routine.cpp b/src/routine.cpp new file mode 100644 index 00000000..d3590896 --- /dev/null +++ b/src/routine.cpp @@ -0,0 +1,131 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Routine base class (see the header for information about the class). +// +// ================================================================================================= + +#include +#include + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// Constructor: not much here, because no status codes can be returned +Routine::Routine(Queue &queue, EventPointer event, const std::string &name, + const std::vector &routines, const Precision precision): + precision_(precision), + routine_name_(name), + queue_(queue), + event_(event), + context_(queue_.GetContext()), + device_(queue_.GetDevice()), + device_name_(device_.Name()), + db_(queue_, routines, precision_) { +} + +// ================================================================================================= + +// Separate set-up function to allow for status codes to be returned +StatusCode Routine::SetUp() { + + // Queries the cache to see whether or not the program (context-specific) is already there + if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; } + + // Queries the cache to see whether or not the binary (device-specific) is already there. If it + // is, a program is created and stored in the cache + if (BinaryIsInCache(device_name_, precision_, routine_name_)) { + try { + auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_); + auto program = Program(device_, context_, binary); + auto options = std::vector(); + program.Build(device_, options); + StoreProgramToCache(program, context_, precision_, routine_name_); + } catch (...) 
{ return StatusCode::kBuildProgramFailure; } + return StatusCode::kSuccess; + } + + // Otherwise, the kernel will be compiled and program will be built. Both the binary and the + // program will be added to the cache. + + // Inspects whether or not cl_khr_fp64 is supported in case of double precision + const auto extensions = device_.Capabilities(); + if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) { + if (extensions.find(kKhronosDoublePrecision) == std::string::npos) { + return StatusCode::kNoDoublePrecision; + } + } + + // As above, but for cl_khr_fp16 (half precision) + if (precision_ == Precision::kHalf) { + if (extensions.find(kKhronosHalfPrecision) == std::string::npos) { + return StatusCode::kNoHalfPrecision; + } + } + + // Loads the common header (typedefs and defines and such) + std::string common_header = + #include "kernels/common.opencl" + ; + + // Collects the parameters for this device in the form of defines, and adds the precision + auto defines = db_.GetDefines(); + defines += "#define PRECISION "+ToString(static_cast(precision_))+"\n"; + + // Adds the name of the routine as a define + defines += "#define ROUTINE_"+routine_name_+"\n"; + + // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve + // performance, but might result in a reduced accuracy. + if (device_.IsAMD() && device_.IsGPU()) { + defines += "#define USE_CL_MAD 1\n"; + } + + // For specific devices, use staggered/shuffled workgroup indices. 
+ if (device_.IsAMD() && device_.IsGPU()) { + defines += "#define USE_STAGGERED_INDICES 1\n"; + } + + // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize + // performance through better cache behaviour + if (device_.IsARM() && device_.IsGPU()) { + defines += "#define GLOBAL_MEM_FENCE 1\n"; + } + + // Combines everything together into a single source string + const auto source_string = defines + common_header + source_string_; + + // Compiles the kernel + try { + auto program = Program(context_, source_string); + auto options = std::vector(); + const auto build_status = program.Build(device_, options); + + // Checks for compiler crashes/errors/warnings + if (build_status == BuildStatus::kError) { + const auto message = program.GetBuildInfo(device_); + fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str()); + return StatusCode::kBuildProgramFailure; + } + if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; } + + // Store the compiled binary and program in the cache + const auto binary = program.GetIR(); + StoreBinaryToCache(binary, device_name_, precision_, routine_name_); + StoreProgramToCache(program, context_, precision_, routine_name_); + } catch (...) { return StatusCode::kBuildProgramFailure; } + + // No errors, normal termination of this function + return StatusCode::kSuccess; +} + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/common.cc b/src/routines/common.cc deleted file mode 100644 index c378df28..00000000 --- a/src/routines/common.cc +++ /dev/null @@ -1,65 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the common routine functions (see the header for more information). -// -// ================================================================================================= - -#include - -#include "routines/common.hpp" - -namespace clblast { -// ================================================================================================= - -// Enqueues a kernel, waits for completion, and checks for errors -StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, - std::vector global, const std::vector &local, - EventPointer event, std::vector& waitForEvents) { - - // Tests for validity of the local thread sizes - if (local.size() > device.MaxWorkItemDimensions()) { - return StatusCode::kInvalidLocalNumDimensions; - } - const auto max_work_item_sizes = device.MaxWorkItemSizes(); - for (auto i=size_t{0}; i max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; } - } - auto local_size = size_t{1}; - for (auto &item: local) { local_size *= item; } - if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; } - - // Make sure the global thread sizes are at least equal to the local sizes - for (auto i=size_t{0}; i global, const std::vector &local, - EventPointer event) { - auto emptyWaitingList = std::vector(); - return RunKernel(kernel, queue, device, global, local, event, emptyWaitingList); -} - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/common.cpp b/src/routines/common.cpp new file mode 100644 index 00000000..c378df28 --- /dev/null +++ b/src/routines/common.cpp @@ -0,0 +1,65 @@ + +// ================================================================================================= +// 
This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the common routine functions (see the header for more information). +// +// ================================================================================================= + +#include + +#include "routines/common.hpp" + +namespace clblast { +// ================================================================================================= + +// Enqueues a kernel, waits for completion, and checks for errors +StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, + std::vector global, const std::vector &local, + EventPointer event, std::vector& waitForEvents) { + + // Tests for validity of the local thread sizes + if (local.size() > device.MaxWorkItemDimensions()) { + return StatusCode::kInvalidLocalNumDimensions; + } + const auto max_work_item_sizes = device.MaxWorkItemSizes(); + for (auto i=size_t{0}; i max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; } + } + auto local_size = size_t{1}; + for (auto &item: local) { local_size *= item; } + if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; } + + // Make sure the global thread sizes are at least equal to the local sizes + for (auto i=size_t{0}; i global, const std::vector &local, + EventPointer event) { + auto emptyWaitingList = std::vector(); + return RunKernel(kernel, queue, device, global, local, event, emptyWaitingList); +} + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cc deleted file mode 100644 index 6b6e7f9e..00000000 --- a/src/routines/level1/xamax.cc +++ /dev/null @@ -1,105 +0,0 
@@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xamax class (see the header for information about the class). -// -// ================================================================================================= - -#include "routines/level1/xamax.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xamax::Xamax(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xdot"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level1/xamax.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xamax::DoAmax(const size_t n, - const Buffer &imax_buffer, const size_t imax_offset, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { - - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorIndex(1, imax_buffer, imax_offset); - if (ErrorIn(status)) { return status; } - - // Retrieves the Xamax kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel1 = Kernel(program, "Xamax"); - auto kernel2 = Kernel(program, "XamaxEpilogue"); - - // Creates the buffer for intermediate values - 
auto temp_size = 2*db_["WGS2"]; - auto temp_buffer1 = Buffer(context_, temp_size); - auto temp_buffer2 = Buffer(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast(x_offset)); - kernel1.SetArgument(3, static_cast(x_inc)); - kernel1.SetArgument(4, temp_buffer1()); - kernel1.SetArgument(5, temp_buffer2()); - - // Event waiting list - auto eventWaitList = std::vector(); - - // Launches the main kernel - auto global1 = std::vector{db_["WGS1"]*temp_size}; - auto local1 = std::vector{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer1()); - kernel2.SetArgument(1, temp_buffer2()); - kernel2.SetArgument(2, imax_buffer()); - kernel2.SetArgument(3, static_cast(imax_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector{db_["WGS2"]}; - auto local2 = std::vector{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xamax; -template class Xamax; -template class Xamax; -template class Xamax; -template class Xamax; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level1/xamax.cpp b/src/routines/level1/xamax.cpp new file mode 100644 index 00000000..6b6e7f9e --- /dev/null +++ b/src/routines/level1/xamax.cpp @@ -0,0 +1,105 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xamax class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level1/xamax.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xamax::Xamax(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xdot"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level1/xamax.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xamax::DoAmax(const size_t n, + const Buffer &imax_buffer, const size_t imax_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc); + if (ErrorIn(status)) { return status; } + status = TestVectorIndex(1, imax_buffer, imax_offset); + if (ErrorIn(status)) { return status; } + + // Retrieves the Xamax kernels from the compiled binary + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel1 = Kernel(program, "Xamax"); + auto kernel2 = Kernel(program, "XamaxEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer1 = Buffer(context_, temp_size); + auto temp_buffer2 = Buffer(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast(x_offset)); + kernel1.SetArgument(3, static_cast(x_inc)); + kernel1.SetArgument(4, temp_buffer1()); + kernel1.SetArgument(5, temp_buffer2()); + + // Event waiting list + auto eventWaitList = 
std::vector(); + + // Launches the main kernel + auto global1 = std::vector{db_["WGS1"]*temp_size}; + auto local1 = std::vector{db_["WGS1"]}; + auto kernelEvent = Event(); + status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer1()); + kernel2.SetArgument(1, temp_buffer2()); + kernel2.SetArgument(2, imax_buffer()); + kernel2.SetArgument(3, static_cast(imax_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector{db_["WGS2"]}; + auto local2 = std::vector{db_["WGS2"]}; + status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xamax; +template class Xamax; +template class Xamax; +template class Xamax; +template class Xamax; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xasum.cc b/src/routines/level1/xasum.cc deleted file mode 100644 index 0c1ce903..00000000 --- a/src/routines/level1/xasum.cc +++ /dev/null @@ -1,102 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xasum class (see the header for information about the class). -// -// ================================================================================================= - -#include "routines/level1/xasum.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xasum::Xasum(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xdot"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level1/xasum.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xasum::DoAsum(const size_t n, - const Buffer &asum_buffer, const size_t asum_offset, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { - - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorScalar(1, asum_buffer, asum_offset); - if (ErrorIn(status)) { return status; } - - // Retrieves the Xasum kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel1 = Kernel(program, "Xasum"); - auto kernel2 = Kernel(program, "XasumEpilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer = Buffer(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast(x_offset)); - kernel1.SetArgument(3, static_cast(x_inc)); - kernel1.SetArgument(4, temp_buffer()); - - // Event 
waiting list - auto eventWaitList = std::vector(); - - // Launches the main kernel - auto global1 = std::vector{db_["WGS1"]*temp_size}; - auto local1 = std::vector{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer()); - kernel2.SetArgument(1, asum_buffer()); - kernel2.SetArgument(2, static_cast(asum_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector{db_["WGS2"]}; - auto local2 = std::vector{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xasum; -template class Xasum; -template class Xasum; -template class Xasum; -template class Xasum; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level1/xasum.cpp b/src/routines/level1/xasum.cpp new file mode 100644 index 00000000..0c1ce903 --- /dev/null +++ b/src/routines/level1/xasum.cpp @@ -0,0 +1,102 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xasum class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level1/xasum.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xasum::Xasum(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xdot"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level1/xasum.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xasum::DoAsum(const size_t n, + const Buffer &asum_buffer, const size_t asum_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc); + if (ErrorIn(status)) { return status; } + status = TestVectorScalar(1, asum_buffer, asum_offset); + if (ErrorIn(status)) { return status; } + + // Retrieves the Xasum kernels from the compiled binary + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel1 = Kernel(program, "Xasum"); + auto kernel2 = Kernel(program, "XasumEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer = Buffer(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast(x_offset)); + kernel1.SetArgument(3, static_cast(x_inc)); + kernel1.SetArgument(4, temp_buffer()); + + // Event waiting list + auto eventWaitList = std::vector(); + + // Launches the main kernel + auto global1 = std::vector{db_["WGS1"]*temp_size}; 
+ auto local1 = std::vector{db_["WGS1"]}; + auto kernelEvent = Event(); + status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, asum_buffer()); + kernel2.SetArgument(2, static_cast(asum_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector{db_["WGS2"]}; + auto local2 = std::vector{db_["WGS2"]}; + status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xasum; +template class Xasum; +template class Xasum; +template class Xasum; +template class Xasum; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc deleted file mode 100644 index 5b6c9e77..00000000 --- a/src/routines/level1/xaxpy.cc +++ /dev/null @@ -1,113 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xaxpy class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level1/xaxpy.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xaxpy::Xaxpy(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xaxpy"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level1/level1.opencl" - #include "../../kernels/level1/xaxpy.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } - - // Determines whether or not the fast-version can be used - bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && - (y_offset == 0) && (y_inc == 1) && - IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); - - // If possible, run the fast-version of the kernel - auto kernel_name = (use_fast_kernel) ? 
"XaxpyFast" : "Xaxpy"; - - // Retrieves the Xaxpy kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast(x_offset)); - kernel.SetArgument(4, static_cast(x_inc)); - kernel.SetArgument(5, y_buffer()); - kernel.SetArgument(6, static_cast(y_offset)); - kernel.SetArgument(7, static_cast(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector{n_ceiled/db_["WPT"]}; - auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xaxpy; -template class Xaxpy; -template class Xaxpy; -template class Xaxpy; -template class Xaxpy; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp new file mode 100644 index 00000000..5b6c9e77 --- /dev/null +++ b/src/routines/level1/xaxpy.cpp @@ -0,0 +1,113 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xaxpy class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level1/xaxpy.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xaxpy::Xaxpy(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xaxpy.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc); + if (ErrorIn(status)) { return status; } + status = TestVectorY(n, y_buffer, y_offset, y_inc); + if (ErrorIn(status)) { return status; } + + // Determines whether or not the fast-version can be used + bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && + (y_offset == 0) && (y_inc == 1) && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + + // If possible, run the fast-version of the kernel + auto kernel_name = (use_fast_kernel) ? 
"XaxpyFast" : "Xaxpy"; + + // Retrieves the Xaxpy kernel from the compiled binary + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Upload the scalar argument as a constant buffer to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast(x_offset)); + kernel.SetArgument(4, static_cast(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, static_cast(y_offset)); + kernel.SetArgument(7, static_cast(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"]}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, queue_, device_, global, local, event_); + } + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xaxpy; +template class Xaxpy; +template class Xaxpy; +template class Xaxpy; +template class Xaxpy; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc deleted file mode 100644 index 673ef349..00000000 --- a/src/routines/level1/xcopy.cc +++ /dev/null @@ -1,107 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xcopy class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level1/xcopy.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xcopy::Xcopy(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xaxpy"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level1/level1.opencl" - #include "../../kernels/level1/xcopy.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xcopy::DoCopy(const size_t n, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } - - // Determines whether or not the fast-version can be used - bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && - (y_offset == 0) && (y_inc == 1) && - IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); - - // If possible, run the fast-version of the kernel - auto kernel_name = (use_fast_kernel) ? 
"XcopyFast" : "Xcopy"; - - // Retrieves the Xcopy kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, static_cast(x_offset)); - kernel.SetArgument(3, static_cast(x_inc)); - kernel.SetArgument(4, y_buffer()); - kernel.SetArgument(5, static_cast(y_offset)); - kernel.SetArgument(6, static_cast(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector{n_ceiled/db_["WPT"]}; - auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xcopy; -template class Xcopy; -template class Xcopy; -template class Xcopy; -template class Xcopy; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level1/xcopy.cpp b/src/routines/level1/xcopy.cpp new file mode 100644 index 00000000..673ef349 --- /dev/null +++ b/src/routines/level1/xcopy.cpp @@ -0,0 +1,107 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xcopy class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level1/xcopy.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xcopy::Xcopy(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xcopy.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xcopy::DoCopy(const size_t n, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc); + if (ErrorIn(status)) { return status; } + status = TestVectorY(n, y_buffer, y_offset, y_inc); + if (ErrorIn(status)) { return status; } + + // Determines whether or not the fast-version can be used + bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && + (y_offset == 0) && (y_inc == 1) && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + + // If possible, run the fast-version of the kernel + auto kernel_name = (use_fast_kernel) ? 
"XcopyFast" : "Xcopy"; + + // Retrieves the Xcopy kernel from the compiled binary + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, static_cast(x_offset)); + kernel.SetArgument(3, static_cast(x_inc)); + kernel.SetArgument(4, y_buffer()); + kernel.SetArgument(5, static_cast(y_offset)); + kernel.SetArgument(6, static_cast(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"]}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, queue_, device_, global, local, event_); + } + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xcopy; +template class Xcopy; +template class Xcopy; +template class Xcopy; +template class Xcopy; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc deleted file mode 100644 index bafea157..00000000 --- a/src/routines/level1/xdot.cc +++ /dev/null @@ -1,110 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xdot class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level1/xdot.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xdot::Xdot(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xdot"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level1/xdot.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xdot::DoDot(const size_t n, - const Buffer &dot_buffer, const size_t dot_offset, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, - const bool do_conjugate) { - - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorScalar(1, dot_buffer, dot_offset); - if (ErrorIn(status)) { return status; } - - // Retrieves the Xdot kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel1 = Kernel(program, "Xdot"); - auto kernel2 = Kernel(program, "XdotEpilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer = Buffer(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast(x_offset)); - kernel1.SetArgument(3, static_cast(x_inc)); - 
kernel1.SetArgument(4, y_buffer()); - kernel1.SetArgument(5, static_cast(y_offset)); - kernel1.SetArgument(6, static_cast(y_inc)); - kernel1.SetArgument(7, temp_buffer()); - kernel1.SetArgument(8, static_cast(do_conjugate)); - - // Event waiting list - auto eventWaitList = std::vector(); - - // Launches the main kernel - auto global1 = std::vector{db_["WGS1"]*temp_size}; - auto local1 = std::vector{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer()); - kernel2.SetArgument(1, dot_buffer()); - kernel2.SetArgument(2, static_cast(dot_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector{db_["WGS2"]}; - auto local2 = std::vector{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xdot; -template class Xdot; -template class Xdot; -template class Xdot; -template class Xdot; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level1/xdot.cpp b/src/routines/level1/xdot.cpp new file mode 100644 index 00000000..bafea157 --- /dev/null +++ b/src/routines/level1/xdot.cpp @@ -0,0 +1,110 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xdot class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level1/xdot.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xdot::Xdot(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xdot"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level1/xdot.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xdot::DoDot(const size_t n, + const Buffer &dot_buffer, const size_t dot_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const bool do_conjugate) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc); + if (ErrorIn(status)) { return status; } + status = TestVectorY(n, y_buffer, y_offset, y_inc); + if (ErrorIn(status)) { return status; } + status = TestVectorScalar(1, dot_buffer, dot_offset); + if (ErrorIn(status)) { return status; } + + // Retrieves the Xdot kernels from the compiled binary + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel1 = Kernel(program, "Xdot"); + auto kernel2 = Kernel(program, "XdotEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 
2*db_["WGS2"]; + auto temp_buffer = Buffer(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast(x_offset)); + kernel1.SetArgument(3, static_cast(x_inc)); + kernel1.SetArgument(4, y_buffer()); + kernel1.SetArgument(5, static_cast(y_offset)); + kernel1.SetArgument(6, static_cast(y_inc)); + kernel1.SetArgument(7, temp_buffer()); + kernel1.SetArgument(8, static_cast(do_conjugate)); + + // Event waiting list + auto eventWaitList = std::vector(); + + // Launches the main kernel + auto global1 = std::vector{db_["WGS1"]*temp_size}; + auto local1 = std::vector{db_["WGS1"]}; + auto kernelEvent = Event(); + status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, dot_buffer()); + kernel2.SetArgument(2, static_cast(dot_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector{db_["WGS2"]}; + auto local2 = std::vector{db_["WGS2"]}; + status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xdot; +template class Xdot; +template class Xdot; +template class Xdot; +template class Xdot; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xdotc.cc b/src/routines/level1/xdotc.cc deleted file mode 100644 index 27cf2bab..00000000 --- a/src/routines/level1/xdotc.cc +++ /dev/null @@ -1,49 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xdotc class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level1/xdotc.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xdotc::Xdotc(Queue &queue, EventPointer event, const std::string &name): - Xdot(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xdotc::DoDotc(const size_t n, - const Buffer &dot_buffer, const size_t dot_offset, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - return DoDot(n, dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - true); -} - -// ================================================================================================= - -// Compiles the templated class -template class Xdotc; -template class Xdotc; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level1/xdotc.cpp b/src/routines/level1/xdotc.cpp new file mode 100644 index 00000000..27cf2bab --- /dev/null +++ b/src/routines/level1/xdotc.cpp @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xdotc class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level1/xdotc.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xdotc::Xdotc(Queue &queue, EventPointer event, const std::string &name): + Xdot(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xdotc::DoDotc(const size_t n, + const Buffer &dot_buffer, const size_t dot_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + return DoDot(n, dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + true); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xdotc; +template class Xdotc; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xdotu.cc b/src/routines/level1/xdotu.cc deleted file mode 100644 index 0bce70b7..00000000 --- a/src/routines/level1/xdotu.cc +++ /dev/null @@ -1,48 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xdotu class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level1/xdotu.hpp" - -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xdotu::Xdotu(Queue &queue, EventPointer event, const std::string &name): - Xdot(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xdotu::DoDotu(const size_t n, - const Buffer &dot_buffer, const size_t dot_offset, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - return DoDot(n, dot_buffer, dot_offset, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - false); -} - -// ================================================================================================= - -// Compiles the templated class -template class Xdotu; -template class Xdotu; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level1/xdotu.cpp b/src/routines/level1/xdotu.cpp new file mode 100644 index 00000000..0bce70b7 --- /dev/null +++ b/src/routines/level1/xdotu.cpp @@ -0,0 +1,48 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xdotu class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level1/xdotu.hpp" + +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xdotu::Xdotu(Queue &queue, EventPointer event, const std::string &name): + Xdot(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xdotu::DoDotu(const size_t n, + const Buffer &dot_buffer, const size_t dot_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + return DoDot(n, dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + false); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xdotu; +template class Xdotu; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc deleted file mode 100644 index 97615d8b..00000000 --- a/src/routines/level1/xnrm2.cc +++ /dev/null @@ -1,102 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xnrm2 class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level1/xnrm2.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xnrm2::Xnrm2(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xdot"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level1/xnrm2.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xnrm2::DoNrm2(const size_t n, - const Buffer &nrm2_buffer, const size_t nrm2_offset, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { - - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorScalar(1, nrm2_buffer, nrm2_offset); - if (ErrorIn(status)) { return status; } - - // Retrieves the Xnrm2 kernels from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel1 = Kernel(program, "Xnrm2"); - auto kernel2 = Kernel(program, "Xnrm2Epilogue"); - - // Creates the buffer for intermediate values - auto temp_size = 2*db_["WGS2"]; - auto temp_buffer = Buffer(context_, temp_size); - - // Sets the kernel arguments - kernel1.SetArgument(0, static_cast(n)); - kernel1.SetArgument(1, x_buffer()); - kernel1.SetArgument(2, static_cast(x_offset)); - kernel1.SetArgument(3, static_cast(x_inc)); - kernel1.SetArgument(4, temp_buffer()); - - // Event waiting list - auto eventWaitList = std::vector(); - - // Launches the main kernel - auto global1 = std::vector{db_["WGS1"]*temp_size}; 
- auto local1 = std::vector{db_["WGS1"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(kernelEvent); - - // Sets the arguments for the epilogue kernel - kernel2.SetArgument(0, temp_buffer()); - kernel2.SetArgument(1, nrm2_buffer()); - kernel2.SetArgument(2, static_cast(nrm2_offset)); - - // Launches the epilogue kernel - auto global2 = std::vector{db_["WGS2"]}; - auto local2 = std::vector{db_["WGS2"]}; - status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xnrm2; -template class Xnrm2; -template class Xnrm2; -template class Xnrm2; -template class Xnrm2; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level1/xnrm2.cpp b/src/routines/level1/xnrm2.cpp new file mode 100644 index 00000000..97615d8b --- /dev/null +++ b/src/routines/level1/xnrm2.cpp @@ -0,0 +1,102 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xnrm2 class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level1/xnrm2.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xnrm2::Xnrm2(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xdot"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level1/xnrm2.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xnrm2::DoNrm2(const size_t n, + const Buffer &nrm2_buffer, const size_t nrm2_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc); + if (ErrorIn(status)) { return status; } + status = TestVectorScalar(1, nrm2_buffer, nrm2_offset); + if (ErrorIn(status)) { return status; } + + // Retrieves the Xnrm2 kernels from the compiled binary + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel1 = Kernel(program, "Xnrm2"); + auto kernel2 = Kernel(program, "Xnrm2Epilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer = Buffer(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast(x_offset)); + kernel1.SetArgument(3, static_cast(x_inc)); + kernel1.SetArgument(4, temp_buffer()); + + // Event waiting list + auto eventWaitList = std::vector(); + + // Launches the main kernel + auto global1 = std::vector{db_["WGS1"]*temp_size}; 
+ auto local1 = std::vector{db_["WGS1"]}; + auto kernelEvent = Event(); + status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, nrm2_buffer()); + kernel2.SetArgument(2, static_cast(nrm2_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector{db_["WGS2"]}; + auto local2 = std::vector{db_["WGS2"]}; + status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xnrm2; +template class Xnrm2; +template class Xnrm2; +template class Xnrm2; +template class Xnrm2; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc deleted file mode 100644 index bcc43c3b..00000000 --- a/src/routines/level1/xscal.cc +++ /dev/null @@ -1,101 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xscal class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level1/xscal.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xscal::Xscal(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xaxpy"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level1/level1.opencl" - #include "../../kernels/level1/xscal.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xscal::DoScal(const size_t n, const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { - - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Tests the vector for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - - // Determines whether or not the fast-version can be used - bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && - IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); - - // If possible, run the fast-version of the kernel - auto kernel_name = (use_fast_kernel) ? 
"XscalFast" : "Xscal"; - - // Retrieves the Xscal kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha); - kernel.SetArgument(2, x_buffer()); - } - else { - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast(x_offset)); - kernel.SetArgument(4, static_cast(x_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector{n_ceiled/db_["WPT"]}; - auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xscal; -template class Xscal; -template class Xscal; -template class Xscal; -template class Xscal; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp new file mode 100644 index 00000000..bcc43c3b --- /dev/null +++ b/src/routines/level1/xscal.cpp @@ -0,0 +1,101 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xscal class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level1/xscal.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xscal::Xscal(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xscal.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xscal::DoScal(const size_t n, const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vector for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc); + if (ErrorIn(status)) { return status; } + + // Determines whether or not the fast-version can be used + bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + + // If possible, run the fast-version of the kernel + auto kernel_name = (use_fast_kernel) ? 
"XscalFast" : "Xscal"; + + // Retrieves the Xscal kernel from the compiled binary + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + } + else { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast(x_offset)); + kernel.SetArgument(4, static_cast(x_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"]}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, queue_, device_, global, local, event_); + } + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xscal; +template class Xscal; +template class Xscal; +template class Xscal; +template class Xscal; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc deleted file mode 100644 index 03907cbd..00000000 --- a/src/routines/level1/xswap.cc +++ /dev/null @@ -1,107 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xswap class (see the header for information about the class). -// -// ================================================================================================= - -#include "routines/level1/xswap.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xswap::Xswap(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xaxpy"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level1/level1.opencl" - #include "../../kernels/level1/xswap.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xswap::DoSwap(const size_t n, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Tests the vectors for validity - auto status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } - - // Determines whether or not the fast-version can be used - bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && - (y_offset == 0) && (y_inc == 1) && - IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); - - // If possible, run the fast-version of the kernel - auto kernel_name = (use_fast_kernel) ? 
"XswapFast" : "Xswap"; - - // Retrieves the Xswap kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, y_buffer()); - } - else { - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, x_buffer()); - kernel.SetArgument(2, static_cast(x_offset)); - kernel.SetArgument(3, static_cast(x_inc)); - kernel.SetArgument(4, y_buffer()); - kernel.SetArgument(5, static_cast(y_offset)); - kernel.SetArgument(6, static_cast(y_inc)); - } - - // Launches the kernel - if (use_fast_kernel) { - auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; - auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - else { - auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); - auto global = std::vector{n_ceiled/db_["WPT"]}; - auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - } - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xswap; -template class Xswap; -template class Xswap; -template class Xswap; -template class Xswap; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level1/xswap.cpp b/src/routines/level1/xswap.cpp new file mode 100644 index 00000000..03907cbd --- /dev/null +++ b/src/routines/level1/xswap.cpp @@ -0,0 +1,107 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xswap class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level1/xswap.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xswap::Xswap(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xswap.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xswap::DoSwap(const size_t n, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc); + if (ErrorIn(status)) { return status; } + status = TestVectorY(n, y_buffer, y_offset, y_inc); + if (ErrorIn(status)) { return status; } + + // Determines whether or not the fast-version can be used + bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && + (y_offset == 0) && (y_inc == 1) && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + + // If possible, run the fast-version of the kernel + auto kernel_name = (use_fast_kernel) ? 
"XswapFast" : "Xswap"; + + // Retrieves the Xswap kernel from the compiled binary + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, static_cast(x_offset)); + kernel.SetArgument(3, static_cast(x_inc)); + kernel.SetArgument(4, y_buffer()); + kernel.SetArgument(5, static_cast(y_offset)); + kernel.SetArgument(6, static_cast(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, queue_, device_, global, local, event_); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"]}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, queue_, device_, global, local, event_); + } + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xswap; +template class Xswap; +template class Xswap; +template class Xswap; +template class Xswap; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc deleted file mode 100644 index ea4f001c..00000000 --- a/src/routines/level2/xgbmv.cc +++ /dev/null @@ -1,68 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xgbmv class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xgbmv.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xgbmv::Xgbmv(Queue &queue, EventPointer event, const std::string &name): - Xgemv(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xgbmv::DoGbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // Reverses the upper and lower band count - auto rotated = (layout == Layout::kRowMajor); - auto kl_real = (rotated) ? ku : kl; - auto ku_real = (rotated) ? kl : ku; - - // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. - // The specific hermitian matrix-accesses are implemented in the kernel guarded by the - // ROUTINE_GBMV define. 
- bool fast_kernels = false; - return MatVec(layout, a_transpose, - m, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - 0, false, kl_real, ku_real); -} - -// ================================================================================================= - -// Compiles the templated class -template class Xgbmv; -template class Xgbmv; -template class Xgbmv; -template class Xgbmv; -template class Xgbmv; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xgbmv.cpp b/src/routines/level2/xgbmv.cpp new file mode 100644 index 00000000..ea4f001c --- /dev/null +++ b/src/routines/level2/xgbmv.cpp @@ -0,0 +1,68 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgbmv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xgbmv.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xgbmv::Xgbmv(Queue &queue, EventPointer event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xgbmv::DoGbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Reverses the upper and lower band count + auto rotated = (layout == Layout::kRowMajor); + auto kl_real = (rotated) ? ku : kl; + auto ku_real = (rotated) ? kl : ku; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_GBMV define. 
+ bool fast_kernels = false; + return MatVec(layout, a_transpose, + m, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + 0, false, kl_real, ku_real); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xgbmv; +template class Xgbmv; +template class Xgbmv; +template class Xgbmv; +template class Xgbmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc deleted file mode 100644 index 21fb397c..00000000 --- a/src/routines/level2/xgemv.cc +++ /dev/null @@ -1,181 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xgemv class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xgemv.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xgemv::Xgemv(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Pad", "Xgemv"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level2/xgemv.opencl" - #include "../../kernels/level2/xgemv_fast.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xgemv::DoGemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // Performs the matrix-vector multiplication - return MatVec(layout, a_transpose, - m, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - true, true, - 0, false, 0, 0); // N/A for this routine -} - -// ================================================================================================= - -// The generic implementation, also suited for other (non general) matrix-vector multiplications -template -StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, - bool fast_kernel, bool fast_kernel_rot, - const size_t parameter, 
const bool packed, - const size_t kl, const size_t ku) { - - // Makes sure all dimensions are larger than zero - if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } - - // Computes whether or not the matrix has an alternative layout (row or column-major). - auto a_altlayout = (layout == Layout::kRowMajor); - auto a_one = (a_altlayout) ? n : m; - auto a_two = (a_altlayout) ? m : n; - - // Swap m and n if the matrix is transposed - auto a_transposed = (a_transpose != Transpose::kNo); - auto m_real = (a_transposed) ? n : m; - auto n_real = (a_transposed) ? m : n; - - // Special adjustments for banded matrices - if (kl != 0 || ku != 0) { - a_one = kl+ku+1; - } - - // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator) - auto a_rotated = a_transposed ^ a_altlayout; - - // In case of complex data-types, the transpose can also become a conjugate transpose - auto a_conjugate = (a_transpose == Transpose::kConjugate); - - // Tests the matrix and the vectors for validity - auto status = StatusCode::kSuccess; - if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); } - else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); } - if (ErrorIn(status)) { return status; } - status = TestVectorX(n_real, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(m_real, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } - - // Determines whether or not the fast-version can be used - fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && - IsMultiple(m, db_["WGS2"]*db_["WPT2"]) && - IsMultiple(n, db_["WGS2"]) && - IsMultiple(a_ld, db_["VW2"]); - fast_kernel_rot = fast_kernel_rot && (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) && - IsMultiple(m, db_["WGS3"]*db_["WPT3"]) && - IsMultiple(n, db_["WGS3"]) && - IsMultiple(a_ld, db_["VW3"]); - - // If possible, run the fast-version (rotated or non-rotated) of the kernel - auto 
kernel_name = "Xgemv"; - auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]); - auto global_size = m_ceiled / db_["WPT1"]; - auto local_size = db_["WGS1"]; - if (fast_kernel) { - kernel_name = "XgemvFast"; - global_size = m_real / db_["WPT2"]; - local_size = db_["WGS2"]; - } - if (fast_kernel_rot) { - kernel_name = "XgemvFastRot"; - global_size = m_real / db_["WPT3"]; - local_size = db_["WGS3"]; - } - - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - - // Retrieves the Xgemv kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(m_real)); - kernel.SetArgument(1, static_cast(n_real)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); - kernel.SetArgument(4, static_cast(a_rotated)); - kernel.SetArgument(5, a_buffer()); - kernel.SetArgument(6, static_cast(a_offset)); - kernel.SetArgument(7, static_cast(a_ld)); - kernel.SetArgument(8, x_buffer()); - kernel.SetArgument(9, static_cast(x_offset)); - kernel.SetArgument(10, static_cast(x_inc)); - kernel.SetArgument(11, y_buffer()); - kernel.SetArgument(12, static_cast(y_offset)); - kernel.SetArgument(13, static_cast(y_inc)); - kernel.SetArgument(14, static_cast(a_conjugate)); - kernel.SetArgument(15, static_cast(parameter)); // extra parameter used for symm/herm - kernel.SetArgument(16, static_cast(kl)); // only used for banded matrices - kernel.SetArgument(17, static_cast(ku)); // only used for banded matrices - - // Launches the kernel - auto global = std::vector{global_size}; - auto local = std::vector{local_size}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - if 
(ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xgemv; -template class Xgemv; -template class Xgemv; -template class Xgemv; -template class Xgemv; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp new file mode 100644 index 00000000..21fb397c --- /dev/null +++ b/src/routines/level2/xgemv.cpp @@ -0,0 +1,181 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgemv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xgemv.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xgemv::Xgemv(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Pad", "Xgemv"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level2/xgemv.opencl" + #include "../../kernels/level2/xgemv_fast.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xgemv::DoGemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Performs the matrix-vector multiplication + return MatVec(layout, a_transpose, + m, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + true, true, + 0, false, 0, 0); // N/A for this routine +} + +// ================================================================================================= + +// The generic implementation, also suited for other (non general) matrix-vector multiplications +template +StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + bool fast_kernel, bool fast_kernel_rot, + const size_t parameter, 
const bool packed, + const size_t kl, const size_t ku) { + + // Makes sure all dimensions are larger than zero + if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrix has an alternative layout (row or column-major). + auto a_altlayout = (layout == Layout::kRowMajor); + auto a_one = (a_altlayout) ? n : m; + auto a_two = (a_altlayout) ? m : n; + + // Swap m and n if the matrix is transposed + auto a_transposed = (a_transpose != Transpose::kNo); + auto m_real = (a_transposed) ? n : m; + auto n_real = (a_transposed) ? m : n; + + // Special adjustments for banded matrices + if (kl != 0 || ku != 0) { + a_one = kl+ku+1; + } + + // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator) + auto a_rotated = a_transposed ^ a_altlayout; + + // In case of complex data-types, the transpose can also become a conjugate transpose + auto a_conjugate = (a_transpose == Transpose::kConjugate); + + // Tests the matrix and the vectors for validity + auto status = StatusCode::kSuccess; + if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); } + else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); } + if (ErrorIn(status)) { return status; } + status = TestVectorX(n_real, x_buffer, x_offset, x_inc); + if (ErrorIn(status)) { return status; } + status = TestVectorY(m_real, y_buffer, y_offset, y_inc); + if (ErrorIn(status)) { return status; } + + // Determines whether or not the fast-version can be used + fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && + IsMultiple(m, db_["WGS2"]*db_["WPT2"]) && + IsMultiple(n, db_["WGS2"]) && + IsMultiple(a_ld, db_["VW2"]); + fast_kernel_rot = fast_kernel_rot && (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) && + IsMultiple(m, db_["WGS3"]*db_["WPT3"]) && + IsMultiple(n, db_["WGS3"]) && + IsMultiple(a_ld, db_["VW3"]); + + // If possible, run the fast-version (rotated or non-rotated) of the kernel + auto 
kernel_name = "Xgemv"; + auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]); + auto global_size = m_ceiled / db_["WPT1"]; + auto local_size = db_["WGS1"]; + if (fast_kernel) { + kernel_name = "XgemvFast"; + global_size = m_real / db_["WPT2"]; + local_size = db_["WGS2"]; + } + if (fast_kernel_rot) { + kernel_name = "XgemvFastRot"; + global_size = m_real / db_["WPT3"]; + local_size = db_["WGS3"]; + } + + // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + auto beta_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + beta_buffer.Write(queue_, 1, &beta); + + // Retrieves the Xgemv kernel from the compiled binary + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(m_real)); + kernel.SetArgument(1, static_cast(n_real)); + kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(4, static_cast(a_rotated)); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, static_cast(a_offset)); + kernel.SetArgument(7, static_cast(a_ld)); + kernel.SetArgument(8, x_buffer()); + kernel.SetArgument(9, static_cast(x_offset)); + kernel.SetArgument(10, static_cast(x_inc)); + kernel.SetArgument(11, y_buffer()); + kernel.SetArgument(12, static_cast(y_offset)); + kernel.SetArgument(13, static_cast(y_inc)); + kernel.SetArgument(14, static_cast(a_conjugate)); + kernel.SetArgument(15, static_cast(parameter)); // extra parameter used for symm/herm + kernel.SetArgument(16, static_cast(kl)); // only used for banded matrices + kernel.SetArgument(17, static_cast(ku)); // only used for banded matrices + + // Launches the kernel + auto global = std::vector{global_size}; + auto local = std::vector{local_size}; + status = RunKernel(kernel, queue_, device_, global, local, event_); + if 
(ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xgemv; +template class Xgemv; +template class Xgemv; +template class Xgemv; +template class Xgemv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc deleted file mode 100644 index 353047d2..00000000 --- a/src/routines/level2/xger.cc +++ /dev/null @@ -1,106 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xger class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xger.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xger::Xger(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xger"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level2/level2.opencl" - #include "../../kernels/level2/xger.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xger::DoGer(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld) { - - // Makes sure all dimensions are larger than zero - if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } - - // Computes whether or not the matrix has an alternative layout (row or column-major). - const auto a_is_rowmajor = (layout == Layout::kRowMajor); - const auto a_one = (a_is_rowmajor) ? n : m; - const auto a_two = (a_is_rowmajor) ? 
m : n; - - // Tests the matrix and the vectors for validity - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestVectorX(m, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } - - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - - // Retrieves the kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, "Xger"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(a_one)); - kernel.SetArgument(1, static_cast(a_two)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, x_buffer()); - kernel.SetArgument(4, static_cast(x_offset)); - kernel.SetArgument(5, static_cast(x_inc)); - kernel.SetArgument(6, y_buffer()); - kernel.SetArgument(7, static_cast(y_offset)); - kernel.SetArgument(8, static_cast(y_inc)); - kernel.SetArgument(9, a_buffer()); - kernel.SetArgument(10, static_cast(a_offset)); - kernel.SetArgument(11, static_cast(a_ld)); - kernel.SetArgument(12, static_cast(a_is_rowmajor)); - - // Launches the kernel - auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]); - auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]); - auto global = std::vector{a_one_ceiled, a_two_ceiled}; - auto local = std::vector{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xger; -template class Xger; -template class Xger; -template class Xger; -template class Xger; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp new file mode 100644 index 00000000..353047d2 --- /dev/null +++ b/src/routines/level2/xger.cpp @@ -0,0 +1,106 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xger class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xger.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xger::Xger(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xger"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level2/level2.opencl" + #include "../../kernels/level2/xger.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xger::DoGer(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld) { + + // Makes sure all dimensions are larger than zero + if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrix has an alternative layout (row or column-major). + const auto a_is_rowmajor = (layout == Layout::kRowMajor); + const auto a_one = (a_is_rowmajor) ? n : m; + const auto a_two = (a_is_rowmajor) ? 
m : n; + + // Tests the matrix and the vectors for validity + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + if (ErrorIn(status)) { return status; } + status = TestVectorX(m, x_buffer, x_offset, x_inc); + if (ErrorIn(status)) { return status; } + status = TestVectorY(n, y_buffer, y_offset, y_inc); + if (ErrorIn(status)) { return status; } + + // Upload the scalar argument as a constant buffer to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + + // Retrieves the kernel from the compiled binary + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel = Kernel(program, "Xger"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(a_one)); + kernel.SetArgument(1, static_cast(a_two)); + kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(3, x_buffer()); + kernel.SetArgument(4, static_cast(x_offset)); + kernel.SetArgument(5, static_cast(x_inc)); + kernel.SetArgument(6, y_buffer()); + kernel.SetArgument(7, static_cast(y_offset)); + kernel.SetArgument(8, static_cast(y_inc)); + kernel.SetArgument(9, a_buffer()); + kernel.SetArgument(10, static_cast(a_offset)); + kernel.SetArgument(11, static_cast(a_ld)); + kernel.SetArgument(12, static_cast(a_is_rowmajor)); + + // Launches the kernel + auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]); + auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]); + auto global = std::vector{a_one_ceiled, a_two_ceiled}; + auto local = std::vector{db_["WGS1"], db_["WGS2"]}; + status = RunKernel(kernel, queue_, device_, global, local, event_); + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xger; +template class Xger; +template class Xger; +template class Xger; +template class Xger; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xgerc.cc b/src/routines/level2/xgerc.cc deleted file mode 100644 index d9feda97..00000000 --- a/src/routines/level2/xgerc.cc +++ /dev/null @@ -1,53 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xgerc class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xgerc.hpp" - -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xgerc::Xgerc(Queue &queue, EventPointer event, const std::string &name): - Xger(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xgerc::DoGerc(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld) { - - // Regular Ger operation on complex data, plus conjugation in the kernel guarded by the - // ROUTINE_GERC guard. - return DoGer(layout, m, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld); -} - -// ================================================================================================= - -// Compiles the templated class -template class Xgerc; -template class Xgerc; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xgerc.cpp b/src/routines/level2/xgerc.cpp new file mode 100644 index 00000000..d9feda97 --- /dev/null +++ b/src/routines/level2/xgerc.cpp @@ -0,0 +1,53 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgerc class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level2/xgerc.hpp" + +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xgerc::Xgerc(Queue &queue, EventPointer event, const std::string &name): + Xger(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xgerc::DoGerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld) { + + // Regular Ger operation on complex data, plus conjugation in the kernel guarded by the + // ROUTINE_GERC guard. + return DoGer(layout, m, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xgerc; +template class Xgerc; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xgeru.cc b/src/routines/level2/xgeru.cc deleted file mode 100644 index da9e91c2..00000000 --- a/src/routines/level2/xgeru.cc +++ /dev/null @@ -1,52 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xgeru class (see the header for information about the class). -// -// ================================================================================================= - -#include "routines/level2/xgeru.hpp" - -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xgeru::Xgeru(Queue &queue, EventPointer event, const std::string &name): - Xger(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xgeru::DoGeru(const Layout layout, - const size_t m, const size_t n, - const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld) { - - // Regular Ger operation on complex data - return DoGer(layout, m, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld); -} - -// ================================================================================================= - -// Compiles the templated class -template class Xgeru; -template class Xgeru; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xgeru.cpp b/src/routines/level2/xgeru.cpp new file mode 100644 index 00000000..da9e91c2 --- /dev/null +++ b/src/routines/level2/xgeru.cpp @@ -0,0 +1,52 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgeru class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level2/xgeru.hpp" + +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xgeru::Xgeru(Queue &queue, EventPointer event, const std::string &name): + Xger(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xgeru::DoGeru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld) { + + // Regular Ger operation on complex data + return DoGer(layout, m, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xgeru; +template class Xgeru; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc deleted file mode 100644 index f6c0e3c4..00000000 --- a/src/routines/level2/xhbmv.cc +++ /dev/null @@ -1,64 +0,0 @@ - -// ================================================================================================= 
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xhbmv class (see the header for information about the class). -// -// ================================================================================================= - -#include "routines/level2/xhbmv.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xhbmv::Xhbmv(Queue &queue, EventPointer event, const std::string &name): - Xgemv(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xhbmv::DoHbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // The data is either in the upper or lower triangle - size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - - // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. - // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the - // ROUTINE_HBMV define. 
- bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, false, k, 0); -} - -// ================================================================================================= - -// Compiles the templated class -template class Xhbmv; -template class Xhbmv; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xhbmv.cpp b/src/routines/level2/xhbmv.cpp new file mode 100644 index 00000000..f6c0e3c4 --- /dev/null +++ b/src/routines/level2/xhbmv.cpp @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhbmv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xhbmv.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xhbmv::Xhbmv(Queue &queue, EventPointer event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xhbmv::DoHbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_HBMV define. 
+ bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, k, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhbmv; +template class Xhbmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc deleted file mode 100644 index 2cbcf7b4..00000000 --- a/src/routines/level2/xhemv.cc +++ /dev/null @@ -1,64 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xhemv class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xhemv.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xhemv::Xhemv(Queue &queue, EventPointer event, const std::string &name): - Xgemv(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xhemv::DoHemv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // The data is either in the upper or lower triangle - size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - - // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. - // The specific hermitian matrix-accesses are implemented in the kernel guarded by the - // ROUTINE_HEMV define. 
- bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, false, 0, 0); -} - -// ================================================================================================= - -// Compiles the templated class -template class Xhemv; -template class Xhemv; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xhemv.cpp b/src/routines/level2/xhemv.cpp new file mode 100644 index 00000000..2cbcf7b4 --- /dev/null +++ b/src/routines/level2/xhemv.cpp @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhemv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xhemv.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xhemv::Xhemv(Queue &queue, EventPointer event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xhemv::DoHemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_HEMV define. 
+ bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, 0, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhemv; +template class Xhemv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc deleted file mode 100644 index ed8ba9e9..00000000 --- a/src/routines/level2/xher.cc +++ /dev/null @@ -1,117 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xher class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xher.hpp" - -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xher::Xher(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xger"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level2/level2.opencl" - #include "../../kernels/level2/xher.opencl" - ; -} - -// ================================================================================================= - -// Specializations to compute alpha of type 'T' -template <> float2 Xher::GetAlpha(const float alpha) { return float2{alpha, 0.0f}; } -template <> double2 Xher::GetAlpha(const double alpha) { return double2{alpha, 0.0}; } -template <> float Xher::GetAlpha(const float alpha) { return alpha; } -template <> double Xher::GetAlpha(const double alpha) { return alpha; } -template <> half Xher::GetAlpha(const half alpha) { return alpha; } - -// ================================================================================================= - -// The main routine -template -StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, - const size_t n, - const U alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const bool packed) { - - // Makes sure the dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // The data is either in the upper or lower triangle - const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - const auto is_rowmajor = (layout == Layout::kRowMajor); - - // Tests the matrix and the vectors for validity - auto status = 
StatusCode::kSuccess; - if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); } - else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); } - if (ErrorIn(status)) { return status; } - status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - - // If alpha is zero an update is not required - if (alpha == U{0}) { return StatusCode::kSuccess; } - - // Creates a matching version of alpha - const auto matching_alpha = GetAlpha(alpha); - - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &matching_alpha); - - // Retrieves the kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, "Xher"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast(x_offset)); - kernel.SetArgument(4, static_cast(x_inc)); - kernel.SetArgument(5, a_buffer()); - kernel.SetArgument(6, static_cast(a_offset)); - kernel.SetArgument(7, static_cast(a_ld)); - kernel.SetArgument(8, static_cast(is_upper)); - kernel.SetArgument(9, static_cast(is_rowmajor)); - - // Launches the kernel - auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]); - auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); - auto global = std::vector{global_one, global_two}; - auto local = std::vector{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xher; -template class Xher; -template class Xher; -template class Xher; -template class Xher; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp new file mode 100644 index 00000000..ed8ba9e9 --- /dev/null +++ b/src/routines/level2/xher.cpp @@ -0,0 +1,117 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xher class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xher.hpp" + +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xher::Xher(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xger"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level2/level2.opencl" + #include "../../kernels/level2/xher.opencl" + ; +} + +// ================================================================================================= + +// Specializations to compute alpha of type 'T' +template <> float2 Xher::GetAlpha(const float alpha) { return float2{alpha, 0.0f}; } +template <> double2 Xher::GetAlpha(const double alpha) { return double2{alpha, 0.0}; } +template <> float Xher::GetAlpha(const float alpha) { return alpha; } +template <> double Xher::GetAlpha(const double alpha) { return alpha; } +template <> half Xher::GetAlpha(const half alpha) { return alpha; } + +// ================================================================================================= + +// The main routine +template +StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, + const size_t n, + const U alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const bool packed) { + + // Makes sure the dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // The data is either in the upper or lower triangle + const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + const auto is_rowmajor = (layout == Layout::kRowMajor); + + // Tests the matrix and the vectors for validity + auto status = 
StatusCode::kSuccess; + if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); } + else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); } + if (ErrorIn(status)) { return status; } + status = TestVectorX(n, x_buffer, x_offset, x_inc); + if (ErrorIn(status)) { return status; } + + // If alpha is zero an update is not required + if (alpha == U{0}) { return StatusCode::kSuccess; } + + // Creates a matching version of alpha + const auto matching_alpha = GetAlpha(alpha); + + // Upload the scalar argument as a constant buffer to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &matching_alpha); + + // Retrieves the kernel from the compiled binary + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel = Kernel(program, "Xher"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast(x_offset)); + kernel.SetArgument(4, static_cast(x_inc)); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, static_cast(a_offset)); + kernel.SetArgument(7, static_cast(a_ld)); + kernel.SetArgument(8, static_cast(is_upper)); + kernel.SetArgument(9, static_cast(is_rowmajor)); + + // Launches the kernel + auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]); + auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); + auto global = std::vector{global_one, global_two}; + auto local = std::vector{db_["WGS1"], db_["WGS2"]}; + status = RunKernel(kernel, queue_, device_, global, local, event_); + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xher; +template class Xher; +template class Xher; +template class Xher; +template class Xher; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc deleted file mode 100644 index 50572cea..00000000 --- a/src/routines/level2/xher2.cc +++ /dev/null @@ -1,108 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xher2 class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xher2.hpp" - -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xher2::Xher2(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xger"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level2/level2.opencl" - #include "../../kernels/level2/xher2.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const bool packed) { - - // Makes sure the dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // The data is either in the upper or lower triangle - const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - const auto is_rowmajor = (layout == Layout::kRowMajor); - - // Tests the matrix and the vectors for validity - auto status = StatusCode::kSuccess; - if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); } - else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); } - if (ErrorIn(status)) { return status; } - status = TestVectorX(n, x_buffer, x_offset, x_inc); - if (ErrorIn(status)) { return status; } - status = TestVectorY(n, y_buffer, y_offset, y_inc); - if (ErrorIn(status)) { return status; } - - // Upload the scalar argument as a constant buffer to the device (needed for 
half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - - // Retrieves the kernel from the compiled binary - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, "Xher2"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); - kernel.SetArgument(2, x_buffer()); - kernel.SetArgument(3, static_cast(x_offset)); - kernel.SetArgument(4, static_cast(x_inc)); - kernel.SetArgument(5, y_buffer()); - kernel.SetArgument(6, static_cast(y_offset)); - kernel.SetArgument(7, static_cast(y_inc)); - kernel.SetArgument(8, a_buffer()); - kernel.SetArgument(9, static_cast(a_offset)); - kernel.SetArgument(10, static_cast(a_ld)); - kernel.SetArgument(11, static_cast(is_upper)); - kernel.SetArgument(12, static_cast(is_rowmajor)); - - // Launches the kernel - auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]); - auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); - auto global = std::vector{global_one, global_two}; - auto local = std::vector{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, queue_, device_, global, local, event_); - if (ErrorIn(status)) { return status; } - - // Succesfully finished the computation - return StatusCode::kSuccess; - } catch (...) 
{ return StatusCode::kInvalidKernel; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xher2; -template class Xher2; -template class Xher2; -template class Xher2; -template class Xher2; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp new file mode 100644 index 00000000..50572cea --- /dev/null +++ b/src/routines/level2/xher2.cpp @@ -0,0 +1,108 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xher2 class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xher2.hpp" + +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xher2::Xher2(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xger"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level2/level2.opencl" + #include "../../kernels/level2/xher2.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const bool packed) { + + // Makes sure the dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // The data is either in the upper or lower triangle + const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + const auto is_rowmajor = (layout == Layout::kRowMajor); + + // Tests the matrix and the vectors for validity + auto status = StatusCode::kSuccess; + if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); } + else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); } + if (ErrorIn(status)) { return status; } + status = TestVectorX(n, x_buffer, x_offset, x_inc); + if (ErrorIn(status)) { return status; } + status = TestVectorY(n, y_buffer, y_offset, y_inc); + if (ErrorIn(status)) { return status; } + + // Upload the scalar argument as a constant buffer to the device (needed for 
half-precision) + auto alpha_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + + // Retrieves the kernel from the compiled binary + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel = Kernel(program, "Xher2"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast(x_offset)); + kernel.SetArgument(4, static_cast(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, static_cast(y_offset)); + kernel.SetArgument(7, static_cast(y_inc)); + kernel.SetArgument(8, a_buffer()); + kernel.SetArgument(9, static_cast(a_offset)); + kernel.SetArgument(10, static_cast(a_ld)); + kernel.SetArgument(11, static_cast(is_upper)); + kernel.SetArgument(12, static_cast(is_rowmajor)); + + // Launches the kernel + auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]); + auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); + auto global = std::vector{global_one, global_two}; + auto local = std::vector{db_["WGS1"], db_["WGS2"]}; + status = RunKernel(kernel, queue_, device_, global, local, event_); + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xher2; +template class Xher2; +template class Xher2; +template class Xher2; +template class Xher2; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc deleted file mode 100644 index e6f82b34..00000000 --- a/src/routines/level2/xhpmv.cc +++ /dev/null @@ -1,64 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xhpmv class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xhpmv.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xhpmv::Xhpmv(Queue &queue, EventPointer event, const std::string &name): - Xgemv(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xhpmv::DoHpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer &ap_buffer, const size_t ap_offset, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // The data is either in the upper or lower triangle - size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - - // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. - // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the - // ROUTINE_HPMV define. 
- bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - ap_buffer, ap_offset, n, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, true, 0, 0); -} - -// ================================================================================================= - -// Compiles the templated class -template class Xhpmv; -template class Xhpmv; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xhpmv.cpp b/src/routines/level2/xhpmv.cpp new file mode 100644 index 00000000..e6f82b34 --- /dev/null +++ b/src/routines/level2/xhpmv.cpp @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhpmv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xhpmv.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xhpmv::Xhpmv(Queue &queue, EventPointer event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xhpmv::DoHpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &ap_buffer, const size_t ap_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_HPMV define. 
+ bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + ap_buffer, ap_offset, n, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, true, 0, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhpmv; +template class Xhpmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xhpr.cc b/src/routines/level2/xhpr.cc deleted file mode 100644 index 225ebfe5..00000000 --- a/src/routines/level2/xhpr.cc +++ /dev/null @@ -1,51 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xhpr class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xhpr.hpp" - -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xhpr::Xhpr(Queue &queue, EventPointer event, const std::string &name): - Xher(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xhpr::DoHpr(const Layout layout, const Triangle triangle, - const size_t n, - const U alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &ap_buffer, const size_t ap_offset) { - - // Specific Xhpr functionality is implemented in the kernel using defines - return DoHer(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, n, - true); // packed matrix -} - -// ================================================================================================= - -// Compiles the templated class -template class Xhpr; -template class Xhpr; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xhpr.cpp b/src/routines/level2/xhpr.cpp new file mode 100644 index 00000000..225ebfe5 --- /dev/null +++ b/src/routines/level2/xhpr.cpp @@ -0,0 +1,51 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhpr class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xhpr.hpp" + +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xhpr::Xhpr(Queue &queue, EventPointer event, const std::string &name): + Xher(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xhpr::DoHpr(const Layout layout, const Triangle triangle, + const size_t n, + const U alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &ap_buffer, const size_t ap_offset) { + + // Specific Xhpr functionality is implemented in the kernel using defines + return DoHer(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, n, + true); // packed matrix +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhpr; +template class Xhpr; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xhpr2.cc b/src/routines/level2/xhpr2.cc deleted file mode 100644 index 85f9d3f9..00000000 --- a/src/routines/level2/xhpr2.cc +++ /dev/null @@ -1,53 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xhpr2 class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xhpr2.hpp" - -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xhpr2::Xhpr2(Queue &queue, EventPointer event, const std::string &name): - Xher2(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xhpr2::DoHpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer &ap_buffer, const size_t ap_offset) { - - // Specific Xhpr2 functionality is implemented in the kernel using defines - return DoHer2(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, n, - true); // packed matrix -} - -// ================================================================================================= - -// Compiles the templated class -template class Xhpr2; -template class Xhpr2; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xhpr2.cpp b/src/routines/level2/xhpr2.cpp new file mode 100644 index 00000000..85f9d3f9 --- /dev/null +++ b/src/routines/level2/xhpr2.cpp @@ -0,0 +1,53 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhpr2 class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level2/xhpr2.hpp" + +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xhpr2::Xhpr2(Queue &queue, EventPointer event, const std::string &name): + Xher2(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xhpr2::DoHpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &ap_buffer, const size_t ap_offset) { + + // Specific Xhpr2 functionality is implemented in the kernel using defines + return DoHer2(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, n, + true); // packed matrix +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhpr2; +template class Xhpr2; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc deleted file mode 100644 index 28730899..00000000 --- a/src/routines/level2/xsbmv.cc +++ /dev/null @@ -1,65 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsbmv class (see the header for information about the class). -// -// ================================================================================================= - -#include "routines/level2/xsbmv.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xsbmv::Xsbmv(Queue &queue, EventPointer event, const std::string &name): - Xgemv(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xsbmv::DoSbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // The data is either in the upper or lower triangle - size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - - // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. - // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the - // ROUTINE_SBMV define. 
- bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, false, k, 0); -} - -// ================================================================================================= - -// Compiles the templated class -template class Xsbmv; -template class Xsbmv; -template class Xsbmv; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xsbmv.cpp b/src/routines/level2/xsbmv.cpp new file mode 100644 index 00000000..28730899 --- /dev/null +++ b/src/routines/level2/xsbmv.cpp @@ -0,0 +1,65 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsbmv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xsbmv.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsbmv::Xsbmv(Queue &queue, EventPointer event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsbmv::DoSbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_SBMV define. 
+ bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, k, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsbmv; +template class Xsbmv; +template class Xsbmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc deleted file mode 100644 index f6651012..00000000 --- a/src/routines/level2/xspmv.cc +++ /dev/null @@ -1,65 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xspmv class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xspmv.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xspmv::Xspmv(Queue &queue, EventPointer event, const std::string &name): - Xgemv(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xspmv::DoSpmv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer &ap_buffer, const size_t ap_offset, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // The data is either in the upper or lower triangle - size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - - // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. - // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the - // ROUTINE_SPMV define. 
- bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - ap_buffer, ap_offset, n, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, true, 0, 0); -} - -// ================================================================================================= - -// Compiles the templated class -template class Xspmv; -template class Xspmv; -template class Xspmv; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xspmv.cpp b/src/routines/level2/xspmv.cpp new file mode 100644 index 00000000..f6651012 --- /dev/null +++ b/src/routines/level2/xspmv.cpp @@ -0,0 +1,65 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xspmv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xspmv.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xspmv::Xspmv(Queue &queue, EventPointer event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xspmv::DoSpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &ap_buffer, const size_t ap_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_SPMV define. 
+ bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + ap_buffer, ap_offset, n, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, true, 0, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xspmv; +template class Xspmv; +template class Xspmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xspr.cc b/src/routines/level2/xspr.cc deleted file mode 100644 index a75fe9c3..00000000 --- a/src/routines/level2/xspr.cc +++ /dev/null @@ -1,52 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xspr class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xspr.hpp" - -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xspr::Xspr(Queue &queue, EventPointer event, const std::string &name): - Xher(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xspr::DoSpr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &ap_buffer, const size_t ap_offset) { - - // Specific Xspr functionality is implemented in the kernel using defines - return DoHer(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - ap_buffer, ap_offset, n, - true); // packed matrix -} - -// ================================================================================================= - -// Compiles the templated class -template class Xspr; -template class Xspr; -template class Xspr; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xspr.cpp b/src/routines/level2/xspr.cpp new file mode 100644 index 00000000..a75fe9c3 --- /dev/null +++ b/src/routines/level2/xspr.cpp @@ -0,0 +1,52 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xspr class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level2/xspr.hpp" + +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xspr::Xspr(Queue &queue, EventPointer event, const std::string &name): + Xher(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xspr::DoSpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &ap_buffer, const size_t ap_offset) { + + // Specific Xspr functionality is implemented in the kernel using defines + return DoHer(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, n, + true); // packed matrix +} + +// ================================================================================================= + +// Compiles the templated class +template class Xspr; +template class Xspr; +template class Xspr; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xspr2.cc b/src/routines/level2/xspr2.cc deleted file mode 100644 index c39a2eb4..00000000 --- a/src/routines/level2/xspr2.cc +++ /dev/null @@ -1,54 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xspr2 class (see the header for information about the class). -// -// ================================================================================================= - -#include "routines/level2/xspr2.hpp" - -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xspr2::Xspr2(Queue &queue, EventPointer event, const std::string &name): - Xher2(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xspr2::DoSpr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer &ap_buffer, const size_t ap_offset) { - - // Specific Xspr2 functionality is implemented in the kernel using defines - return DoHer2(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - ap_buffer, ap_offset, n, - true); // packed matrix -} - -// ================================================================================================= - -// Compiles the templated class -template class Xspr2; -template class Xspr2; -template class Xspr2; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xspr2.cpp b/src/routines/level2/xspr2.cpp new file mode 100644 index 00000000..c39a2eb4 --- /dev/null +++ b/src/routines/level2/xspr2.cpp @@ -0,0 +1,54 @@ + +// 
================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xspr2 class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level2/xspr2.hpp" + +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xspr2::Xspr2(Queue &queue, EventPointer event, const std::string &name): + Xher2(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xspr2::DoSpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &ap_buffer, const size_t ap_offset) { + + // Specific Xspr2 functionality is implemented in the kernel using defines + return DoHer2(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, n, + true); // packed matrix +} + +// ================================================================================================= + +// Compiles the templated class +template class Xspr2; +template class Xspr2; +template class Xspr2; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xsymv.cc 
b/src/routines/level2/xsymv.cc deleted file mode 100644 index 648d2a3e..00000000 --- a/src/routines/level2/xsymv.cc +++ /dev/null @@ -1,65 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsymv class (see the header for information about the class). -// -// ================================================================================================= - -#include "routines/level2/xsymv.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xsymv::Xsymv(Queue &queue, EventPointer event, const std::string &name): - Xgemv(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xsymv::DoSymv(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { - - // The data is either in the upper or lower triangle - size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - - // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. - // The specific symmetric matrix-accesses are implemented in the kernel guarded by the - // ROUTINE_SYMV define. 
- bool fast_kernels = false; - return MatVec(layout, Transpose::kNo, - n, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc, - fast_kernels, fast_kernels, - is_upper, false, 0, 0); -} - -// ================================================================================================= - -// Compiles the templated class -template class Xsymv; -template class Xsymv; -template class Xsymv; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xsymv.cpp b/src/routines/level2/xsymv.cpp new file mode 100644 index 00000000..648d2a3e --- /dev/null +++ b/src/routines/level2/xsymv.cpp @@ -0,0 +1,65 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsymv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xsymv.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsymv::Xsymv(Queue &queue, EventPointer event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsymv::DoSymv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific symmetric matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_SYMV define. 
+ bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, 0, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsymv; +template class Xsymv; +template class Xsymv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xsyr.cc b/src/routines/level2/xsyr.cc deleted file mode 100644 index 758d8f8f..00000000 --- a/src/routines/level2/xsyr.cc +++ /dev/null @@ -1,51 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsyr class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xsyr.hpp" - -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xsyr::Xsyr(Queue &queue, EventPointer event, const std::string &name): - Xher(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xsyr::DoSyr(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld) { - - // Specific Xsyr functionality is implemented in the kernel using defines - return DoHer(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - a_buffer, a_offset, a_ld); -} - -// ================================================================================================= - -// Compiles the templated class -template class Xsyr; -template class Xsyr; -template class Xsyr; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xsyr.cpp b/src/routines/level2/xsyr.cpp new file mode 100644 index 00000000..758d8f8f --- /dev/null +++ b/src/routines/level2/xsyr.cpp @@ -0,0 +1,51 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level2/xsyr.hpp" + +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsyr::Xsyr(Queue &queue, EventPointer event, const std::string &name): + Xher(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsyr::DoSyr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld) { + + // Specific Xsyr functionality is implemented in the kernel using defines + return DoHer(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsyr; +template class Xsyr; +template class Xsyr; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xsyr2.cc b/src/routines/level2/xsyr2.cc deleted file mode 100644 index 6f43b219..00000000 --- a/src/routines/level2/xsyr2.cc +++ /dev/null @@ -1,53 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsyr2 class (see the header for information about the class). -// -// ================================================================================================= - -#include "routines/level2/xsyr2.hpp" - -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xsyr2::Xsyr2(Queue &queue, EventPointer event, const std::string &name): - Xher2(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xsyr2::DoSyr2(const Layout layout, const Triangle triangle, - const size_t n, - const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld) { - - // Specific Xsyr2 functionality is implemented in the kernel using defines - return DoHer2(layout, triangle, n, alpha, - x_buffer, x_offset, x_inc, - y_buffer, y_offset, y_inc, - a_buffer, a_offset, a_ld); -} - -// ================================================================================================= - -// Compiles the templated class -template class Xsyr2; -template class Xsyr2; -template class Xsyr2; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xsyr2.cpp b/src/routines/level2/xsyr2.cpp new file mode 100644 index 00000000..6f43b219 --- /dev/null +++ b/src/routines/level2/xsyr2.cpp @@ -0,0 +1,53 @@ + +// 
================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr2 class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level2/xsyr2.hpp" + +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsyr2::Xsyr2(Queue &queue, EventPointer event, const std::string &name): + Xher2(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsyr2::DoSyr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld) { + + // Specific Xsyr2 functionality is implemented in the kernel using defines + return DoHer2(layout, triangle, n, alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsyr2; +template class Xsyr2; +template class Xsyr2; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc 
deleted file mode 100644 index e315c544..00000000 --- a/src/routines/level2/xtbmv.cc +++ /dev/null @@ -1,82 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xtbmv class (see the header for information about the class). -// -// ================================================================================================= - -#include "routines/level2/xtbmv.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xtbmv::Xtbmv(Queue &queue, EventPointer event, const std::string &name): - Xgemv(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xtbmv::DoTbmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { - - // Creates a copy of X: a temporary scratch buffer - auto scratch_buffer = Buffer(context_, n*x_inc + x_offset); - try { - x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); - } catch (...) 
{ } // Continues: error-code is returned in MatVec - - // The data is either in the upper or lower triangle - size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - - // Adds '2' to the parameter if the diagonal is unit - auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; - - // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. - // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the - // ROUTINE_TBMV define. - auto fast_kernels = false; - auto status = MatVec(layout, a_transpose, - n, n, static_cast(1), - a_buffer, a_offset, a_ld, - scratch_buffer, x_offset, x_inc, static_cast(0), - x_buffer, x_offset, x_inc, - fast_kernels, fast_kernels, - parameter, false, k, 0); - - // Returns the proper error code (renames vector Y to X) - switch(status) { - case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; - case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; - case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; - default: return status; - } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xtbmv; -template class Xtbmv; -template class Xtbmv; -template class Xtbmv; -template class Xtbmv; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xtbmv.cpp b/src/routines/level2/xtbmv.cpp new file mode 100644 index 00000000..e315c544 --- /dev/null +++ b/src/routines/level2/xtbmv.cpp @@ -0,0 +1,82 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtbmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level2/xtbmv.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xtbmv::Xtbmv(Queue &queue, EventPointer event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xtbmv::DoTbmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Creates a copy of X: a temporary scratch buffer + auto scratch_buffer = Buffer(context_, n*x_inc + x_offset); + try { + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + } catch (...) { } // Continues: error-code is returned in MatVec + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Adds '2' to the parameter if the diagonal is unit + auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_TBMV define. 
+ auto fast_kernels = false; + auto status = MatVec(layout, a_transpose, + n, n, static_cast(1), + a_buffer, a_offset, a_ld, + scratch_buffer, x_offset, x_inc, static_cast(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, false, k, 0); + + // Returns the proper error code (renames vector Y to X) + switch(status) { + case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; + case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; + case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; + default: return status; + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtbmv; +template class Xtbmv; +template class Xtbmv; +template class Xtbmv; +template class Xtbmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc deleted file mode 100644 index 46811089..00000000 --- a/src/routines/level2/xtpmv.cc +++ /dev/null @@ -1,82 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xtpmv class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xtpmv.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xtpmv::Xtpmv(Queue &queue, EventPointer event, const std::string &name): - Xgemv(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xtpmv::DoTpmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const Buffer &ap_buffer, const size_t ap_offset, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { - - // Creates a copy of X: a temporary scratch buffer - auto scratch_buffer = Buffer(context_, n*x_inc + x_offset); - try { - x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); - } catch (...) { } // Continues: error-code is returned in MatVec - - // The data is either in the upper or lower triangle - size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - - // Adds '2' to the parameter if the diagonal is unit - auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; - - // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. - // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the - // ROUTINE_TPMV define. 
- auto fast_kernels = false; - auto status = MatVec(layout, a_transpose, - n, n, static_cast(1), - ap_buffer, ap_offset, n, - scratch_buffer, x_offset, x_inc, static_cast(0), - x_buffer, x_offset, x_inc, - fast_kernels, fast_kernels, - parameter, true, 0, 0); - - // Returns the proper error code (renames vector Y to X) - switch(status) { - case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; - case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; - case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; - default: return status; - } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xtpmv; -template class Xtpmv; -template class Xtpmv; -template class Xtpmv; -template class Xtpmv; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xtpmv.cpp b/src/routines/level2/xtpmv.cpp new file mode 100644 index 00000000..46811089 --- /dev/null +++ b/src/routines/level2/xtpmv.cpp @@ -0,0 +1,82 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtpmv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xtpmv.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xtpmv::Xtpmv(Queue &queue, EventPointer event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xtpmv::DoTpmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &ap_buffer, const size_t ap_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Creates a copy of X: a temporary scratch buffer + auto scratch_buffer = Buffer(context_, n*x_inc + x_offset); + try { + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + } catch (...) { } // Continues: error-code is returned in MatVec + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Adds '2' to the parameter if the diagonal is unit + auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_TPMV define. 
+ auto fast_kernels = false; + auto status = MatVec(layout, a_transpose, + n, n, static_cast(1), + ap_buffer, ap_offset, n, + scratch_buffer, x_offset, x_inc, static_cast(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, true, 0, 0); + + // Returns the proper error code (renames vector Y to X) + switch(status) { + case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; + case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; + case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; + default: return status; + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtpmv; +template class Xtpmv; +template class Xtpmv; +template class Xtpmv; +template class Xtpmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc deleted file mode 100644 index d2f24252..00000000 --- a/src/routines/level2/xtrmv.cc +++ /dev/null @@ -1,82 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xtrmv class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level2/xtrmv.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xtrmv::Xtrmv(Queue &queue, EventPointer event, const std::string &name): - Xgemv(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xtrmv::DoTrmv(const Layout layout, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { - - // Creates a copy of X: a temporary scratch buffer - auto scratch_buffer = Buffer(context_, n*x_inc + x_offset); - try { - x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); - } catch (...) { } // Continues: error-code is returned in MatVec - - // The data is either in the upper or lower triangle - size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - - // Adds '2' to the parameter if the diagonal is unit - auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; - - // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. - // The specific triangular matrix-accesses are implemented in the kernel guarded by the - // ROUTINE_TRMV define. 
- auto fast_kernels = false; - auto status = MatVec(layout, a_transpose, - n, n, static_cast(1), - a_buffer, a_offset, a_ld, - scratch_buffer, x_offset, x_inc, static_cast(0), - x_buffer, x_offset, x_inc, - fast_kernels, fast_kernels, - parameter, false, 0, 0); - - // Returns the proper error code (renames vector Y to X) - switch(status) { - case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; - case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; - case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; - default: return status; - } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xtrmv; -template class Xtrmv; -template class Xtrmv; -template class Xtrmv; -template class Xtrmv; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level2/xtrmv.cpp b/src/routines/level2/xtrmv.cpp new file mode 100644 index 00000000..d2f24252 --- /dev/null +++ b/src/routines/level2/xtrmv.cpp @@ -0,0 +1,82 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrmv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xtrmv.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xtrmv::Xtrmv(Queue &queue, EventPointer event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xtrmv::DoTrmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Creates a copy of X: a temporary scratch buffer + auto scratch_buffer = Buffer(context_, n*x_inc + x_offset); + try { + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + } catch (...) { } // Continues: error-code is returned in MatVec + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Adds '2' to the parameter if the diagonal is unit + auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific triangular matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_TRMV define. 
+ auto fast_kernels = false; + auto status = MatVec(layout, a_transpose, + n, n, static_cast(1), + a_buffer, a_offset, a_ld, + scratch_buffer, x_offset, x_inc, static_cast(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, false, 0, 0); + + // Returns the proper error code (renames vector Y to X) + switch(status) { + case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; + case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; + case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; + default: return status; + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrmv; +template class Xtrmv; +template class Xtrmv; +template class Xtrmv; +template class Xtrmv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc deleted file mode 100644 index 9ea5559c..00000000 --- a/src/routines/level3/xgemm.cc +++ /dev/null @@ -1,223 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xgemm class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level3/xgemm.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level3/level3.opencl" - #include "../../kernels/level3/copy_fast.opencl" - #include "../../kernels/level3/copy_pad.opencl" - #include "../../kernels/level3/transpose_fast.opencl" - #include "../../kernels/level3/transpose_pad.opencl" - #include "../../kernels/level3/convert_symmetric.opencl" - #include "../../kernels/level3/convert_triangular.opencl" - #include "../../kernels/level3/convert_hermitian.opencl" - #include "../../kernels/level3/xgemm_part1.opencl" - #include "../../kernels/level3/xgemm_part2.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xgemm::DoGemm(const Layout layout, - const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; } - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. 
Note - // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of - // col-major) to be transformed, so transposing requirements are not the same as whether or not - // the matrix is actually transposed in memory. - const auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); - const auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && b_transpose == Transpose::kNo); - const auto c_rotated = (layout == Layout::kRowMajor); - const auto a_do_transpose = a_rotated; - const auto b_do_transpose = !b_rotated; - const auto c_do_transpose = c_rotated; - - // In case of complex data-types, the transpose can also become a conjugate transpose - const auto a_conjugate = (a_transpose == Transpose::kConjugate); - const auto b_conjugate = (b_transpose == Transpose::kConjugate); - - // Computes the first and second dimensions of the 3 matrices taking into account whether the - // matrices are rotated or not - const auto a_one = (a_rotated) ? k : m; - const auto a_two = (a_rotated) ? m : k; - const auto b_one = (b_rotated) ? n : k; - const auto b_two = (b_rotated) ? k : n; - const auto c_one = (c_rotated) ? n : m; - const auto c_two = (c_rotated) ? m : n; - - // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. 
Also tests that the leading dimensions of: - // matrix A cannot be less than K when rotated, or less than M when not-rotated - // matrix B cannot be less than N when rotated, or less than K when not-rotated - // matrix C cannot be less than N when rotated, or less than M when not-rotated - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of m, n, and k - const auto m_ceiled = Ceil(m, db_["MWG"]); - const auto n_ceiled = Ceil(n, db_["NWG"]); - const auto k_ceiled = Ceil(k, db_["KWG"]); - - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 && - a_do_transpose == false && a_conjugate == false; - auto b_no_temp = b_one == n_ceiled && b_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && - b_do_transpose == false && b_conjugate == false; - auto c_no_temp = c_one == m_ceiled && c_two == n_ceiled && c_ld == m_ceiled && c_offset == 0 && - c_do_transpose == false; - - // Creates the temporary matrices - const auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, k_ceiled*m_ceiled); - const auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); - const auto c_temp = (c_no_temp) ? 
c_buffer : Buffer(context_, m_ceiled*n_ceiled); - - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector(); - auto emptyEventList = std::vector(); - - // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. - if (!a_no_temp) { - auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - m_ceiled, k_ceiled, m_ceiled, 0, a_temp, - ConstantOne(), program, - true, a_do_transpose, a_conjugate); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessA); - } - - // As above, but now for matrix B - if (!b_no_temp) { - auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, - b_one, b_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne(), program, - true, b_do_transpose, b_conjugate); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessB); - } - - // As above, but now for matrix C. This is only necessary if C is used both as input and output. 
- if (!c_no_temp && beta != static_cast(0)) { - auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, - c_one, c_two, c_ld, c_offset, c_buffer, - m_ceiled, n_ceiled, m_ceiled, 0, c_temp, - ConstantOne(), program, - true, c_do_transpose, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessC); - } - - // Retrieves the Xgemm kernel from the compiled binary - try { - auto kernel = Kernel(program, "Xgemm"); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(m_ceiled)); - kernel.SetArgument(1, static_cast(n_ceiled)); - kernel.SetArgument(2, static_cast(k_ceiled)); - kernel.SetArgument(3, alpha_buffer()); - kernel.SetArgument(4, beta_buffer()); - kernel.SetArgument(5, a_temp()); - kernel.SetArgument(6, b_temp()); - kernel.SetArgument(7, c_temp()); - - // Computes the global and local thread sizes - const auto global = std::vector{ - (m_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - const auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel = Event(); - auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_; - status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList); - if (ErrorIn(status)) { return status; } - - // Runs the post-processing kernel if needed - if (!c_no_temp) { - eventWaitList.push_back(eventKernel); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, - m_ceiled, n_ceiled, m_ceiled, 0, c_temp, - c_one, c_two, c_ld, c_offset, c_buffer, - ConstantOne(), program, - false, c_do_transpose, false); - if (ErrorIn(status)) { return status; } - } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xgemm; -template class Xgemm; -template class Xgemm; -template class Xgemm; -template class Xgemm; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp new file mode 100644 index 00000000..9ea5559c --- /dev/null +++ b/src/routines/level3/xgemm.cpp @@ -0,0 +1,223 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xgemm class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level3/xgemm.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" + #include "../../kernels/level3/convert_symmetric.opencl" + #include "../../kernels/level3/convert_triangular.opencl" + #include "../../kernels/level3/convert_hermitian.opencl" + #include "../../kernels/level3/xgemm_part1.opencl" + #include "../../kernels/level3/xgemm_part2.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xgemm::DoGemm(const Layout layout, + const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. 
Note + // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of + // col-major) to be transformed, so transposing requirements are not the same as whether or not + // the matrix is actually transposed in memory. + const auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); + const auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && b_transpose == Transpose::kNo); + const auto c_rotated = (layout == Layout::kRowMajor); + const auto a_do_transpose = a_rotated; + const auto b_do_transpose = !b_rotated; + const auto c_do_transpose = c_rotated; + + // In case of complex data-types, the transpose can also become a conjugate transpose + const auto a_conjugate = (a_transpose == Transpose::kConjugate); + const auto b_conjugate = (b_transpose == Transpose::kConjugate); + + // Computes the first and second dimensions of the 3 matrices taking into account whether the + // matrices are rotated or not + const auto a_one = (a_rotated) ? k : m; + const auto a_two = (a_rotated) ? m : k; + const auto b_one = (b_rotated) ? n : k; + const auto b_two = (b_rotated) ? k : n; + const auto c_one = (c_rotated) ? n : m; + const auto c_two = (c_rotated) ? m : n; + + // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. 
Also tests that the leading dimensions of: + // matrix A cannot be less than K when rotated, or less than M when not-rotated + // matrix B cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N when rotated, or less than M when not-rotated + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + if (ErrorIn(status)) { return status; } + status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of m, n, and k + const auto m_ceiled = Ceil(m, db_["MWG"]); + const auto n_ceiled = Ceil(n, db_["NWG"]); + const auto k_ceiled = Ceil(k, db_["KWG"]); + + // The padded/transposed input/output matrices: if memory allocation fails, throw an exception + try { + + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 && + a_do_transpose == false && a_conjugate == false; + auto b_no_temp = b_one == n_ceiled && b_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && + b_do_transpose == false && b_conjugate == false; + auto c_no_temp = c_one == m_ceiled && c_two == n_ceiled && c_ld == m_ceiled && c_offset == 0 && + c_do_transpose == false; + + // Creates the temporary matrices + const auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, k_ceiled*m_ceiled); + const auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); + const auto c_temp = (c_no_temp) ? 
c_buffer : Buffer(context_, m_ceiled*n_ceiled); + + // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + auto beta_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + beta_buffer.Write(queue_, 1, &beta); + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector(); + auto emptyEventList = std::vector(); + + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. + if (!a_no_temp) { + auto eventProcessA = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + m_ceiled, k_ceiled, m_ceiled, 0, a_temp, + ConstantOne(), program, + true, a_do_transpose, a_conjugate); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessA); + } + + // As above, but now for matrix B + if (!b_no_temp) { + auto eventProcessB = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, + b_one, b_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, b_temp, + ConstantOne(), program, + true, b_do_transpose, b_conjugate); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessB); + } + + // As above, but now for matrix C. This is only necessary if C is used both as input and output. 
+ if (!c_no_temp && beta != static_cast(0)) { + auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + c_one, c_two, c_ld, c_offset, c_buffer, + m_ceiled, n_ceiled, m_ceiled, 0, c_temp, + ConstantOne(), program, + true, c_do_transpose, false); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessC); + } + + // Retrieves the Xgemm kernel from the compiled binary + try { + auto kernel = Kernel(program, "Xgemm"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(m_ceiled)); + kernel.SetArgument(1, static_cast(n_ceiled)); + kernel.SetArgument(2, static_cast(k_ceiled)); + kernel.SetArgument(3, alpha_buffer()); + kernel.SetArgument(4, beta_buffer()); + kernel.SetArgument(5, a_temp()); + kernel.SetArgument(6, b_temp()); + kernel.SetArgument(7, c_temp()); + + // Computes the global and local thread sizes + const auto global = std::vector{ + (m_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + const auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel = Event(); + auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_; + status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel if needed + if (!c_no_temp) { + eventWaitList.push_back(eventKernel); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + m_ceiled, n_ceiled, m_ceiled, 0, c_temp, + c_one, c_two, c_ld, c_offset, c_buffer, + ConstantOne(), program, + false, c_do_transpose, false); + if (ErrorIn(status)) { return status; } + } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xgemm; +template class Xgemm; +template class Xgemm; +template class Xgemm; +template class Xgemm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc deleted file mode 100644 index 9813503e..00000000 --- a/src/routines/level3/xhemm.cc +++ /dev/null @@ -1,134 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xhemm class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level3/xhemm.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xhemm::Xhemm(Queue &queue, EventPointer event, const std::string &name): - Xgemm(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xhemm::DoHemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } - - // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the - // left) or B (on the right) in the Xgemm routine. - auto k = (side == Side::kLeft) ? m : n; - - // Checks for validity of the squared A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? 
"HermUpperToSquared" : "HermLowerToSquared"; - - // Temporary buffer for a copy of the hermitian matrix - try { - auto temp_herm = Buffer(context_, k*k); - - // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm - // routine afterwards - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the hermitian-to-squared kernel - kernel.SetArgument(0, static_cast(k)); - kernel.SetArgument(1, static_cast(a_ld)); - kernel.SetArgument(2, static_cast(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast(k)); - kernel.SetArgument(5, static_cast(k)); - kernel.SetArgument(6, static_cast(0)); - kernel.SetArgument(7, temp_herm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // hermitian-to-squared kernel uses the same parameters. - auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - - // Synchronize now: 'DoGemm' does not accept a list of events to wait for - kernelEvent.WaitForCompletion(); - - // Runs the regular Xgemm code with either "C := AB+C" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - temp_herm, 0, k, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld); - } - - // ... with "C := BA+C". Note that A and B are now reversed. 
- else { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_herm, 0, k, - beta, - c_buffer, c_offset, c_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } - } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xhemm; -template class Xhemm; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level3/xhemm.cpp b/src/routines/level3/xhemm.cpp new file mode 100644 index 00000000..9813503e --- /dev/null +++ b/src/routines/level3/xhemm.cpp @@ -0,0 +1,134 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xhemm class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level3/xhemm.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xhemm::Xhemm(Queue &queue, EventPointer event, const std::string &name): + Xgemm(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xhemm::DoHemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the + // left) or B (on the right) in the Xgemm routine. + auto k = (side == Side::kLeft) ? m : n; + + // Checks for validity of the squared A matrix + auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld); + if (ErrorIn(status)) { return status; } + + // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as + // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix + bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + auto kernel_name = (is_upper) ? 
"HermUpperToSquared" : "HermLowerToSquared"; + + // Temporary buffer for a copy of the hermitian matrix + try { + auto temp_herm = Buffer(context_, k*k); + + // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm + // routine afterwards + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the hermitian-to-squared kernel + kernel.SetArgument(0, static_cast(k)); + kernel.SetArgument(1, static_cast(a_ld)); + kernel.SetArgument(2, static_cast(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast(k)); + kernel.SetArgument(5, static_cast(k)); + kernel.SetArgument(6, static_cast(0)); + kernel.SetArgument(7, temp_herm()); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // hermitian-to-squared kernel uses the same parameters. + auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + auto kernelEvent = Event(); + status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); + if (ErrorIn(status)) { return status; } + + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + + // Runs the regular Xgemm code with either "C := AB+C" or ... + if (side == Side::kLeft) { + status = DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + temp_herm, 0, k, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld); + } + + // ... with "C := BA+C". Note that A and B are now reversed. 
+ else { + status = DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_herm, 0, k, + beta, + c_buffer, c_offset, c_ld); + + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(status) { + case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; + case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; + case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; + case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; + case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; + case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; + } + } + + // Return the status of the Xgemm routine + return status; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhemm; +template class Xhemm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc deleted file mode 100644 index bd7a053e..00000000 --- a/src/routines/level3/xher2k.cc +++ /dev/null @@ -1,241 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xher2k class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level3/xher2k.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xher2k::Xher2k(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level3/level3.opencl" - #include "../../kernels/level3/copy_fast.opencl" - #include "../../kernels/level3/copy_pad.opencl" - #include "../../kernels/level3/transpose_fast.opencl" - #include "../../kernels/level3/transpose_pad.opencl" - #include "../../kernels/level3/xgemm_part1.opencl" - #include "../../kernels/level3/xgemm_part2.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } - - // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or - // to matrix A (argument: conjugate transpose) - auto ab_conjugate = (ab_transpose != Transpose::kNo); - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. 
- auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) || - (layout == Layout::kRowMajor && !ab_conjugate); - auto c_rotated = (layout == Layout::kRowMajor); - - // Computes the first and second dimensions of the A and B matrices taking the layout into account - auto ab_one = (ab_rotated) ? k : n; - auto ab_two = (ab_rotated) ? n : k; - - // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. Also tests that the leading dimensions of: - // matrix A cannot be less than N when rotated, or less than K when not-rotated - // matrix B cannot be less than N when rotated, or less than K when not-rotated - // matrix C cannot be less than N - auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Decides which kernel to run: the upper-triangular or lower-triangular version - auto kernel_name = (triangle == Triangle::kUpper) ? 
"XgemmUpper" : "XgemmLower"; - - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - ab_rotated == false && ab_conjugate == false; - auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - ab_rotated == false && ab_conjugate == true; - auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && - ab_rotated == false && ab_conjugate == false; - auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && - ab_rotated == false && ab_conjugate == true; - - // Creates the temporary matrices - auto a1_temp = (a1_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); - auto a2_temp = (a2_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); - auto b1_temp = (b1_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); - auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); - auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto complex_beta = T{beta, static_cast(0.0)}; - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &complex_beta); - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector(); - auto emptyEventList = std::vector(); - - // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). 
In - // case nothing has to be done, these kernels can be skipped. - if (!a1_no_temp) { - auto eventProcessA1 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA1.pointer(), emptyEventList, - ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, - ConstantOne(), program, - true, ab_rotated, ab_conjugate); - eventWaitList.push_back(eventProcessA1); - if (ErrorIn(status)) { return status; } - } - if (!a2_no_temp) { - auto eventProcessA2 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA2.pointer(), emptyEventList, - ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, - ConstantOne(), program, - true, ab_rotated, !ab_conjugate); - eventWaitList.push_back(eventProcessA2); - if (ErrorIn(status)) { return status; } - } - if (!b1_no_temp) { - auto eventProcessB1 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB1.pointer(), emptyEventList, - ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, - ConstantOne(), program, - true, ab_rotated, ab_conjugate); - eventWaitList.push_back(eventProcessB1); - if (ErrorIn(status)) { return status; } - } - if (!b2_no_temp) { - auto eventProcessB2 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB2.pointer(), emptyEventList, - ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, - ConstantOne(), program, - true, ab_rotated, !ab_conjugate); - eventWaitList.push_back(eventProcessB2); - if (ErrorIn(status)) { return status; } - } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. 
- auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, - n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne(), program, - true, c_rotated, false); - eventWaitList.push_back(eventProcessC); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(n_ceiled)); - kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); - kernel.SetArgument(4, a1_temp()); - kernel.SetArgument(5, b2_temp()); - kernel.SetArgument(6, c_temp()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel1 = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel1); - - // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha - auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; - auto complex_one = T{static_cast(1.0), static_cast(0.0)}; - alpha_buffer.Write(queue_, 1, &conjugate_alpha); - beta_buffer.Write(queue_, 1, &complex_one); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); - kernel.SetArgument(4, b1_temp()); - kernel.SetArgument(5, a2_temp()); - - // Runs the kernel again - auto eventKernel2 = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel2); - - // Runs the post-processing 
kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - n, n, c_ld, c_offset, c_buffer, - ConstantOne(), program, - false, c_rotated, false, upper, lower, true); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xher2k; -template class Xher2k; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp new file mode 100644 index 00000000..bd7a053e --- /dev/null +++ b/src/routines/level3/xher2k.cpp @@ -0,0 +1,241 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xher2k class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level3/xher2k.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xher2k::Xher2k(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" + #include "../../kernels/level3/xgemm_part1.opencl" + #include "../../kernels/level3/xgemm_part2.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or + // to matrix A (argument: conjugate transpose) + auto ab_conjugate = (ab_transpose != Transpose::kNo); + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. 
+ auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) || + (layout == Layout::kRowMajor && !ab_conjugate); + auto c_rotated = (layout == Layout::kRowMajor); + + // Computes the first and second dimensions of the A and B matrices taking the layout into account + auto ab_one = (ab_rotated) ? k : n; + auto ab_two = (ab_rotated) ? n : k; + + // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix B cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld); + if (ErrorIn(status)) { return status; } + status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? 
"XgemmUpper" : "XgemmLower"; + + // The padded/transposed input/output matrices: if memory allocation fails, throw an exception + try { + + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + ab_rotated == false && ab_conjugate == false; + auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + ab_rotated == false && ab_conjugate == true; + auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && + ab_rotated == false && ab_conjugate == false; + auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && + ab_rotated == false && ab_conjugate == true; + + // Creates the temporary matrices + auto a1_temp = (a1_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto a2_temp = (a2_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto b1_temp = (b1_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer(context_, n_ceiled*n_ceiled); + + // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + auto complex_beta = T{beta, static_cast(0.0)}; + auto alpha_buffer = Buffer(context_, 1); + auto beta_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + beta_buffer.Write(queue_, 1, &complex_beta); + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector(); + auto emptyEventList = std::vector(); + + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). 
In + // case nothing has to be done, these kernels can be skipped. + if (!a1_no_temp) { + auto eventProcessA1 = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA1.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, + ConstantOne(), program, + true, ab_rotated, ab_conjugate); + eventWaitList.push_back(eventProcessA1); + if (ErrorIn(status)) { return status; } + } + if (!a2_no_temp) { + auto eventProcessA2 = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA2.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, + ConstantOne(), program, + true, ab_rotated, !ab_conjugate); + eventWaitList.push_back(eventProcessA2); + if (ErrorIn(status)) { return status; } + } + if (!b1_no_temp) { + auto eventProcessB1 = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB1.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, + ConstantOne(), program, + true, ab_rotated, ab_conjugate); + eventWaitList.push_back(eventProcessB1); + if (ErrorIn(status)) { return status; } + } + if (!b2_no_temp) { + auto eventProcessB2 = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB2.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, + ConstantOne(), program, + true, ab_rotated, !ab_conjugate); + eventWaitList.push_back(eventProcessB2); + if (ErrorIn(status)) { return status; } + } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. 
+ auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + ConstantOne(), program, + true, c_rotated, false); + eventWaitList.push_back(eventProcessC); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(4, a1_temp()); + kernel.SetArgument(5, b2_temp()); + kernel.SetArgument(6, c_temp()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel1 = Event(); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel1); + + // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha + auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; + auto complex_one = T{static_cast(1.0), static_cast(0.0)}; + alpha_buffer.Write(queue_, 1, &conjugate_alpha); + beta_buffer.Write(queue_, 1, &complex_one); + kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(4, b1_temp()); + kernel.SetArgument(5, a2_temp()); + + // Runs the kernel again + auto eventKernel2 = Event(); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel2); + + // Runs the post-processing 
kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + n, n, c_ld, c_offset, c_buffer, + ConstantOne(), program, + false, c_rotated, false, upper, lower, true); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xher2k; +template class Xher2k; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc deleted file mode 100644 index 6ef7f21f..00000000 --- a/src/routines/level3/xherk.cc +++ /dev/null @@ -1,197 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xherk class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level3/xherk.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xherk::Xherk(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level3/level3.opencl" - #include "../../kernels/level3/copy_fast.opencl" - #include "../../kernels/level3/copy_pad.opencl" - #include "../../kernels/level3/transpose_fast.opencl" - #include "../../kernels/level3/transpose_pad.opencl" - #include "../../kernels/level3/xgemm_part1.opencl" - #include "../../kernels/level3/xgemm_part2.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const U alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } - - // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or - // to matrix A (argument: conjugate transpose) - auto a_conjugate = (a_transpose != Transpose::kNo); - auto b_conjugate = (a_transpose == Transpose::kNo); - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. 
- auto a_rotated = (layout == Layout::kColMajor && a_conjugate) || - (layout == Layout::kRowMajor && !a_conjugate); - auto c_rotated = (layout == Layout::kRowMajor); - - // Computes the first and second dimensions of the A matrix taking the layout into account - auto a_one = (a_rotated) ? k : n; - auto a_two = (a_rotated) ? n : k; - - // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. Also tests that the leading dimensions of: - // matrix A cannot be less than N when rotated, or less than K when not-rotated - // matrix C cannot be less than N - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Decides which kernel to run: the upper-triangular or lower-triangular version - auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - a_rotated == false && a_conjugate == false; - auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - a_rotated == false && b_conjugate == false; - - // Creates the temporary matrices - auto a_temp = (a_no_temp) ? 
a_buffer : Buffer(context_, k_ceiled*n_ceiled); - auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); - auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto complex_alpha = T{alpha, static_cast(0.0)}; - auto complex_beta = T{beta, static_cast(0.0)}; - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &complex_alpha); - beta_buffer.Write(queue_, 1, &complex_beta); - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector(); - auto emptyEventList = std::vector(); - - // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. Two copies are created. - if (!a_no_temp) { - auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne(), program, - true, a_rotated, a_conjugate); - eventWaitList.push_back(eventProcessA); - if (ErrorIn(status)) { return status; } - } - if (!b_no_temp) { - auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne(), program, - true, a_rotated, b_conjugate); - eventWaitList.push_back(eventProcessB); - if (ErrorIn(status)) { return status; } - } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. 
- auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, - n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne(), program, - true, c_rotated, false); - eventWaitList.push_back(eventProcessC); - if (ErrorIn(status)) { return status; } - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(n_ceiled)); - kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); - kernel.SetArgument(4, a_temp()); - kernel.SetArgument(5, b_temp()); - kernel.SetArgument(6, c_temp()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel); - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - n, n, c_ld, c_offset, c_buffer, - ConstantOne(), program, - false, c_rotated, false, upper, lower, true); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xherk; -template class Xherk; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp new file mode 100644 index 00000000..6ef7f21f --- /dev/null +++ b/src/routines/level3/xherk.cpp @@ -0,0 +1,197 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xherk class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level3/xherk.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xherk::Xherk(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" + #include "../../kernels/level3/xgemm_part1.opencl" + #include "../../kernels/level3/xgemm_part2.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const U alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const U beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or + // to matrix A (argument: conjugate transpose) + auto a_conjugate = (a_transpose != Transpose::kNo); + auto b_conjugate = (a_transpose == Transpose::kNo); + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. 
+ auto a_rotated = (layout == Layout::kColMajor && a_conjugate) || + (layout == Layout::kRowMajor && !a_conjugate); + auto c_rotated = (layout == Layout::kRowMajor); + + // Computes the first and second dimensions of the A matrix taking the layout into account + auto a_one = (a_rotated) ? k : n; + auto a_two = (a_rotated) ? n : k; + + // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // The padded/transposed input/output matrices: if memory allocation fails, throw an exception + try { + + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + a_rotated == false && a_conjugate == false; + auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + a_rotated == false && b_conjugate == false; + + // Creates the temporary matrices + auto a_temp = (a_no_temp) ? 
a_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer(context_, n_ceiled*n_ceiled); + + // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + auto complex_alpha = T{alpha, static_cast(0.0)}; + auto complex_beta = T{beta, static_cast(0.0)}; + auto alpha_buffer = Buffer(context_, 1); + auto beta_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &complex_alpha); + beta_buffer.Write(queue_, 1, &complex_beta); + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector(); + auto emptyEventList = std::vector(); + + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. Two copies are created. + if (!a_no_temp) { + auto eventProcessA = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a_temp, + ConstantOne(), program, + true, a_rotated, a_conjugate); + eventWaitList.push_back(eventProcessA); + if (ErrorIn(status)) { return status; } + } + if (!b_no_temp) { + auto eventProcessB = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, b_temp, + ConstantOne(), program, + true, a_rotated, b_conjugate); + eventWaitList.push_back(eventProcessB); + if (ErrorIn(status)) { return status; } + } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. 
+ auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + ConstantOne(), program, + true, c_rotated, false); + eventWaitList.push_back(eventProcessC); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(4, a_temp()); + kernel.SetArgument(5, b_temp()); + kernel.SetArgument(6, c_temp()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel = Event(); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel); + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + n, n, c_ld, c_offset, c_buffer, + ConstantOne(), program, + false, c_rotated, false, upper, lower, true); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xherk; +template class Xherk; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc deleted file mode 100644 index 04e4b718..00000000 --- a/src/routines/level3/xsymm.cc +++ /dev/null @@ -1,137 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsymm class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level3/xsymm.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xsymm::Xsymm(Queue &queue, EventPointer event, const std::string &name): - Xgemm(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xsymm::DoSymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } - - // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the - // left) or B (on the right) in the Xgemm routine. - auto k = (side == Side::kLeft) ? m : n; - - // Checks for validity of the squared A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? 
"SymmUpperToSquared" : "SymmLowerToSquared"; - - // Temporary buffer for a copy of the symmetric matrix - try { - auto temp_symm = Buffer(context_, k*k); - - // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm - // routine afterwards - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the symmetric-to-squared kernel - kernel.SetArgument(0, static_cast(k)); - kernel.SetArgument(1, static_cast(a_ld)); - kernel.SetArgument(2, static_cast(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast(k)); - kernel.SetArgument(5, static_cast(k)); - kernel.SetArgument(6, static_cast(0)); - kernel.SetArgument(7, temp_symm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // symmetric-to-squared kernel uses the same parameters. - auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - - // Synchronize now: 'DoGemm' does not accept a list of events to wait for - kernelEvent.WaitForCompletion(); - - // Runs the regular Xgemm code with either "C := AB+C" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - temp_symm, 0, k, - b_buffer, b_offset, b_ld, - beta, - c_buffer, c_offset, c_ld); - } - - // ... with "C := BA+C". Note that A and B are now reversed. 
- else { - status = DoGemm(layout, Transpose::kNo, Transpose::kNo, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_symm, 0, k, - beta, - c_buffer, c_offset, c_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } - } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xsymm; -template class Xsymm; -template class Xsymm; -template class Xsymm; -template class Xsymm; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level3/xsymm.cpp b/src/routines/level3/xsymm.cpp new file mode 100644 index 00000000..04e4b718 --- /dev/null +++ b/src/routines/level3/xsymm.cpp @@ -0,0 +1,137 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsymm class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level3/xsymm.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsymm::Xsymm(Queue &queue, EventPointer event, const std::string &name): + Xgemm(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsymm::DoSymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the + // left) or B (on the right) in the Xgemm routine. + auto k = (side == Side::kLeft) ? m : n; + + // Checks for validity of the squared A matrix + auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld); + if (ErrorIn(status)) { return status; } + + // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as + // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix + bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + auto kernel_name = (is_upper) ? 
"SymmUpperToSquared" : "SymmLowerToSquared"; + + // Temporary buffer for a copy of the symmetric matrix + try { + auto temp_symm = Buffer(context_, k*k); + + // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm + // routine afterwards + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the symmetric-to-squared kernel + kernel.SetArgument(0, static_cast(k)); + kernel.SetArgument(1, static_cast(a_ld)); + kernel.SetArgument(2, static_cast(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast(k)); + kernel.SetArgument(5, static_cast(k)); + kernel.SetArgument(6, static_cast(0)); + kernel.SetArgument(7, temp_symm()); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // symmetric-to-squared kernel uses the same parameters. + auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + auto kernelEvent = Event(); + status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); + if (ErrorIn(status)) { return status; } + + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + + // Runs the regular Xgemm code with either "C := AB+C" or ... + if (side == Side::kLeft) { + status = DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + temp_symm, 0, k, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld); + } + + // ... with "C := BA+C". Note that A and B are now reversed. 
+ else { + status = DoGemm(layout, Transpose::kNo, Transpose::kNo, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_symm, 0, k, + beta, + c_buffer, c_offset, c_ld); + + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(status) { + case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; + case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; + case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; + case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; + case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; + case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; + } + } + + // Return the status of the Xgemm routine + return status; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsymm; +template class Xsymm; +template class Xsymm; +template class Xsymm; +template class Xsymm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc deleted file mode 100644 index 424d4d2d..00000000 --- a/src/routines/level3/xsyr2k.cc +++ /dev/null @@ -1,210 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsyr2k class (see the header for information about the class). -// -// ================================================================================================= - -#include "routines/level3/xsyr2k.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xsyr2k::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level3/level3.opencl" - #include "../../kernels/level3/copy_fast.opencl" - #include "../../kernels/level3/copy_pad.opencl" - #include "../../kernels/level3/transpose_fast.opencl" - #include "../../kernels/level3/transpose_pad.opencl" - #include "../../kernels/level3/xgemm_part1.opencl" - #include "../../kernels/level3/xgemm_part2.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. 
- auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo); - auto c_rotated = (layout == Layout::kRowMajor); - - // Computes the first and second dimensions of the A and B matrices taking the layout into account - auto ab_one = (ab_rotated) ? k : n; - auto ab_two = (ab_rotated) ? n : k; - - // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. Also tests that the leading dimensions of: - // matrix A cannot be less than N when rotated, or less than K when not-rotated - // matrix B cannot be less than N when rotated, or less than K when not-rotated - // matrix C cannot be less than N - auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Decides which kernel to run: the upper-triangular or lower-triangular version - auto kernel_name = (triangle == Triangle::kUpper) ? 
"XgemmUpper" : "XgemmLower"; - - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - ab_rotated == false; - auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && - ab_rotated == false; - - // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); - auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); - auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector(); - auto emptyEventList = std::vector(); - - // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. 
- if (!a_no_temp) { - auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, - ab_one, ab_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne(), program, - true, ab_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessA); - } - if (!b_no_temp) { - auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, - ab_one, ab_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne(), program, - true, ab_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessB); - } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. - auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, - n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne(), program, - true, c_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessC); - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(n_ceiled)); - kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); - kernel.SetArgument(4, a_temp()); - kernel.SetArgument(5, b_temp()); - kernel.SetArgument(6, c_temp()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel1 = Event(); - status = 
RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel1); - - // Swaps the arguments for matrices A and B, and sets 'beta' to 1 - auto one = static_cast(1); - beta_buffer.Write(queue_, 1, &one); - kernel.SetArgument(3, beta_buffer()); - kernel.SetArgument(4, b_temp()); - kernel.SetArgument(5, a_temp()); - - // Runs the kernel again - auto eventKernel2 = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel2); - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - n, n, c_ld, c_offset, c_buffer, - ConstantOne(), program, - false, c_rotated, false, upper, lower, false); - if (ErrorIn(status)) { return status; } - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xsyr2k; -template class Xsyr2k; -template class Xsyr2k; -template class Xsyr2k; -template class Xsyr2k; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp new file mode 100644 index 00000000..424d4d2d --- /dev/null +++ b/src/routines/level3/xsyr2k.cpp @@ -0,0 +1,210 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr2k class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level3/xsyr2k.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsyr2k::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" + #include "../../kernels/level3/xgemm_part1.opencl" + #include "../../kernels/level3/xgemm_part2.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. 
+ auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo); + auto c_rotated = (layout == Layout::kRowMajor); + + // Computes the first and second dimensions of the A and B matrices taking the layout into account + auto ab_one = (ab_rotated) ? k : n; + auto ab_two = (ab_rotated) ? n : k; + + // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix B cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld); + if (ErrorIn(status)) { return status; } + status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? 
"XgemmUpper" : "XgemmLower"; + + // The padded/transposed input/output matrices: if memory allocation fails, throw an exception + try { + + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + ab_rotated == false; + auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && + ab_rotated == false; + + // Creates the temporary matrices + auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer(context_, n_ceiled*n_ceiled); + + // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + auto beta_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + beta_buffer.Write(queue_, 1, &beta); + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector(); + auto emptyEventList = std::vector(); + + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. 
+ if (!a_no_temp) { + auto eventProcessA = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a_temp, + ConstantOne(), program, + true, ab_rotated, false); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessA); + } + if (!b_no_temp) { + auto eventProcessB = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, b_temp, + ConstantOne(), program, + true, ab_rotated, false); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessB); + } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + ConstantOne(), program, + true, c_rotated, false); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessC); + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(4, a_temp()); + kernel.SetArgument(5, b_temp()); + kernel.SetArgument(6, c_temp()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel1 = Event(); + status = 
RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel1); + + // Swaps the arguments for matrices A and B, and sets 'beta' to 1 + auto one = static_cast(1); + beta_buffer.Write(queue_, 1, &one); + kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(4, b_temp()); + kernel.SetArgument(5, a_temp()); + + // Runs the kernel again + auto eventKernel2 = Event(); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel2); + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + n, n, c_ld, c_offset, c_buffer, + ConstantOne(), program, + false, c_rotated, false, upper, lower, false); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsyr2k; +template class Xsyr2k; +template class Xsyr2k; +template class Xsyr2k; +template class Xsyr2k; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc deleted file mode 100644 index f56c232b..00000000 --- a/src/routines/level3/xsyrk.cc +++ /dev/null @@ -1,181 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xsyrk class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level3/xsyrk.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xsyrk::Xsyrk(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level3/level3.opencl" - #include "../../kernels/level3/copy_fast.opencl" - #include "../../kernels/level3/copy_pad.opencl" - #include "../../kernels/level3/transpose_fast.opencl" - #include "../../kernels/level3/transpose_pad.opencl" - #include "../../kernels/level3/xgemm_part1.opencl" - #include "../../kernels/level3/xgemm_part2.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { - - // Makes sure all dimensions are larger than zero - if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } - - // Computes whether or not the matrices are transposed in memory. This is based on their layout - // (row or column-major) and whether or not they are requested to be pre-transposed. 
- auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); - auto c_rotated = (layout == Layout::kRowMajor); - - // Computes the first and second dimensions of the A matrix taking the layout into account - auto a_one = (a_rotated) ? k : n; - auto a_two = (a_rotated) ? n : k; - - // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and - // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the - // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage - // space. Also tests that the leading dimensions of: - // matrix A cannot be less than N when rotated, or less than K when not-rotated - // matrix C cannot be less than N - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); - if (ErrorIn(status)) { return status; } - - // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); - - // Decides which kernel to run: the upper-triangular or lower-triangular version - auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - - // The padded/transposed input/output matrices: if memory allocation fails, throw an exception - try { - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - - // Determines whether or not temporary matrices are needed - auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && - a_rotated == false; - - // Creates the temporary matrices - auto a_temp = (a_no_temp) ? 
a_buffer : Buffer(context_, k_ceiled*n_ceiled); - auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - - // Events of all kernels (including pre/post processing kernels) - auto eventWaitList = std::vector(); - auto emptyEventList = std::vector(); - - // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros - // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In - // case nothing has to be done, these kernels can be skipped. - if (!a_no_temp) { - auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne(), program, - true, a_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessA); - } - - // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to - // modify the other triangle. 
- auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, - n, n, c_ld, c_offset, c_buffer, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne(), program, - true, c_rotated, false); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventProcessC); - - // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - kernel.SetArgument(0, static_cast(n_ceiled)); - kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); - kernel.SetArgument(4, a_temp()); - kernel.SetArgument(5, a_temp()); - kernel.SetArgument(6, c_temp()); - - // Computes the global and local thread sizes - auto global = std::vector{ - (n_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] - }; - auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; - - // Launches the kernel - auto eventKernel = Event(); - status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); - if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel); - - // Runs the post-processing kernel - auto upper = (triangle == Triangle::kUpper); - auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, - n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - n, n, c_ld, c_offset, c_buffer, - ConstantOne(), program, - false, c_rotated, false, upper, lower, false); - if (ErrorIn(status)) { return status; } - - - // Successfully finished the computation - return StatusCode::kSuccess; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xsyrk; -template class Xsyrk; -template class Xsyrk; -template class Xsyrk; -template class Xsyrk; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp new file mode 100644 index 00000000..f56c232b --- /dev/null +++ b/src/routines/level3/xsyrk.cpp @@ -0,0 +1,181 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyrk class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level3/xsyrk.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsyrk::Xsyrk(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" + #include "../../kernels/level3/xgemm_part1.opencl" + #include "../../kernels/level3/xgemm_part2.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. 
+ auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); + auto c_rotated = (layout == Layout::kRowMajor); + + // Computes the first and second dimensions of the A matrix taking the layout into account + auto a_one = (a_rotated) ? k : n; + auto a_two = (a_rotated) ? n : k; + + // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // The padded/transposed input/output matrices: if memory allocation fails, throw an exception + try { + + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + + // Determines whether or not temporary matrices are needed + auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && + a_rotated == false; + + // Creates the temporary matrices + auto a_temp = (a_no_temp) ? 
a_buffer : Buffer(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer(context_, n_ceiled*n_ceiled); + + // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + auto beta_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + beta_buffer.Write(queue_, 1, &beta); + + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector(); + auto emptyEventList = std::vector(); + + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros + // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In + // case nothing has to be done, these kernels can be skipped. + if (!a_no_temp) { + auto eventProcessA = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, a_temp, + ConstantOne(), program, + true, a_rotated, false); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessA); + } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. 
+ auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + ConstantOne(), program, + true, c_rotated, false); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessC); + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(4, a_temp()); + kernel.SetArgument(5, a_temp()); + kernel.SetArgument(6, c_temp()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + auto eventKernel = Event(); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel); + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + n, n, c_ld, c_offset, c_buffer, + ConstantOne(), program, + false, c_rotated, false, upper, lower, false); + if (ErrorIn(status)) { return status; } + + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsyrk; +template class Xsyrk; +template class Xsyrk; +template class Xsyrk; +template class Xsyrk; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc deleted file mode 100644 index 74a82822..00000000 --- a/src/routines/level3/xtrmm.cc +++ /dev/null @@ -1,140 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xtrmm class (see the header for information about the class). 
-// -// ================================================================================================= - -#include "routines/level3/xtrmm.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xtrmm::Xtrmm(Queue &queue, EventPointer event, const std::string &name): - Xgemm(queue, event, name) { -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { - - // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } - - // Computes the k dimension. This is based on whether or not matrix is A (on the left) - // or B (on the right) in the Xgemm routine. - auto k = (side == Side::kLeft) ? m : n; - - // Checks for validity of the triangular A matrix - auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? "TriaUpperToSquared" : "TriaLowerToSquared"; - - // Determines whether or not the triangular matrix is unit-diagonal - auto unit_diagonal = (diagonal == Diagonal::kUnit) ? 
true : false; - - // Temporary buffer for a copy of the triangular matrix - try { - auto temp_triangular = Buffer(context_, k*k); - - // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm - // routine afterwards - try { - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the triangular-to-squared kernel - kernel.SetArgument(0, static_cast(k)); - kernel.SetArgument(1, static_cast(a_ld)); - kernel.SetArgument(2, static_cast(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast(k)); - kernel.SetArgument(5, static_cast(k)); - kernel.SetArgument(6, static_cast(0)); - kernel.SetArgument(7, temp_triangular()); - kernel.SetArgument(8, static_cast(unit_diagonal)); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // triangular-to-squared kernel uses the same parameters. - auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - auto kernelEvent = Event(); - status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); - if (ErrorIn(status)) { return status; } - - // Synchronize now: 'DoGemm' does not accept a list of events to wait for - kernelEvent.WaitForCompletion(); - - // Runs the regular Xgemm code with either "B := alpha*A*B" or ... - if (side == Side::kLeft) { - status = DoGemm(layout, a_transpose, Transpose::kNo, - m, n, k, - alpha, - temp_triangular, 0, k, - b_buffer, b_offset, b_ld, - static_cast(0.0), - b_buffer, b_offset, b_ld); - } - - // ... with "B := alpha*B*A". Note that A and B are now reversed. 
- else { - status = DoGemm(layout, Transpose::kNo, a_transpose, - m, n, k, - alpha, - b_buffer, b_offset, b_ld, - temp_triangular, 0, k, - static_cast(0.0), - b_buffer, b_offset, b_ld); - - // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine - switch(status) { - case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; - case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; - case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; - case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; - case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; - case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; - } - } - - // Return the status of the Xgemm routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Xtrmm; -template class Xtrmm; -template class Xtrmm; -template class Xtrmm; -template class Xtrmm; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/level3/xtrmm.cpp b/src/routines/level3/xtrmm.cpp new file mode 100644 index 00000000..74a82822 --- /dev/null +++ b/src/routines/level3/xtrmm.cpp @@ -0,0 +1,140 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrmm class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level3/xtrmm.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xtrmm::Xtrmm(Queue &queue, EventPointer event, const std::string &name): + Xgemm(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } + + // Computes the k dimension. This is based on whether or not matrix is A (on the left) + // or B (on the right) in the Xgemm routine. + auto k = (side == Side::kLeft) ? m : n; + + // Checks for validity of the triangular A matrix + auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld); + if (ErrorIn(status)) { return status; } + + // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as + // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix + bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + auto kernel_name = (is_upper) ? 
"TriaUpperToSquared" : "TriaLowerToSquared"; + + // Determines whether or not the triangular matrix is unit-diagonal + auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false; + + // Temporary buffer for a copy of the triangular matrix + try { + auto temp_triangular = Buffer(context_, k*k); + + // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm + // routine afterwards + try { + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + auto kernel = Kernel(program, kernel_name); + + // Sets the arguments for the triangular-to-squared kernel + kernel.SetArgument(0, static_cast(k)); + kernel.SetArgument(1, static_cast(a_ld)); + kernel.SetArgument(2, static_cast(a_offset)); + kernel.SetArgument(3, a_buffer()); + kernel.SetArgument(4, static_cast(k)); + kernel.SetArgument(5, static_cast(k)); + kernel.SetArgument(6, static_cast(0)); + kernel.SetArgument(7, temp_triangular()); + kernel.SetArgument(8, static_cast(unit_diagonal)); + + // Uses the common padding kernel's thread configuration. This is allowed, since the + // triangular-to-squared kernel uses the same parameters. + auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + auto kernelEvent = Event(); + status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); + if (ErrorIn(status)) { return status; } + + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + + // Runs the regular Xgemm code with either "B := alpha*A*B" or ... + if (side == Side::kLeft) { + status = DoGemm(layout, a_transpose, Transpose::kNo, + m, n, k, + alpha, + temp_triangular, 0, k, + b_buffer, b_offset, b_ld, + static_cast(0.0), + b_buffer, b_offset, b_ld); + } + + // ... with "B := alpha*B*A". Note that A and B are now reversed. 
+ else { + status = DoGemm(layout, Transpose::kNo, a_transpose, + m, n, k, + alpha, + b_buffer, b_offset, b_ld, + temp_triangular, 0, k, + static_cast(0.0), + b_buffer, b_offset, b_ld); + + // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine + switch(status) { + case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break; + case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break; + case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break; + case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break; + case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break; + case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break; + } + } + + // Return the status of the Xgemm routine + return status; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrmm; +template class Xtrmm; +template class Xtrmm; +template class Xtrmm; +template class Xtrmm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/levelx/xomatcopy.cc b/src/routines/levelx/xomatcopy.cc deleted file mode 100644 index e8593301..00000000 --- a/src/routines/levelx/xomatcopy.cc +++ /dev/null @@ -1,94 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// This file implements the Xomatcopy class (see the header for information about the class). -// -// ================================================================================================= - -#include "routines/levelx/xomatcopy.hpp" - -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor: forwards to base class constructor -template -Xomatcopy::Xomatcopy(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue()) { - source_string_ = - #include "../../kernels/level3/level3.opencl" - #include "../../kernels/level3/copy_fast.opencl" - #include "../../kernels/level3/copy_pad.opencl" - #include "../../kernels/level3/transpose_fast.opencl" - #include "../../kernels/level3/transpose_pad.opencl" - ; -} - -// ================================================================================================= - -// The main routine -template -StatusCode Xomatcopy::DoOmatcopy(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { - - // Makes sure all dimensions are larger than zero - if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } - - // Determines whether to transpose the matrix A - const auto transpose = (a_transpose != Transpose::kNo); - - // In case of complex data-types, the transpose can also become a conjugate transpose - const auto conjugate = (a_transpose == Transpose::kConjugate); - - // Computes the dimensions of the two matrices - const auto rotated = (layout == Layout::kRowMajor); - const auto a_one = (rotated) ? n : m; - const auto a_two = (rotated) ? m : n; - const auto b_one = (transpose) ? 
a_two : a_one; - const auto b_two = (transpose) ? a_one : a_two; - - // Tests the matrices for validity, first from a perspective of the OpenCL buffers and their - // sizes, and then from a perspective of parameter values (e.g. m, n). Tests whether the OpenCL - // buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage space. - // Also tests that the leading dimensions of: - // matrix A cannot be less than N when rotated, or less than M when not-rotated - // matrix B cannot be less than M when rotated, or less than N when not-rotated - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); - if (ErrorIn(status)) { return status; } - status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); - if (ErrorIn(status)) { return status; } - - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - - auto emptyEventList = std::vector(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, emptyEventList, - a_one, a_two, a_ld, a_offset, a_buffer, - b_one, b_two, b_ld, b_offset, b_buffer, - alpha, program, false, transpose, conjugate); - if (ErrorIn(status)) { return status; } - - return StatusCode::kSuccess; -} - -// ================================================================================================= - -// Compiles the templated class -template class Xomatcopy; -template class Xomatcopy; -template class Xomatcopy; -template class Xomatcopy; -template class Xomatcopy; - -// ================================================================================================= -} // namespace clblast diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp new file mode 100644 index 00000000..e8593301 --- /dev/null +++ b/src/routines/levelx/xomatcopy.cpp @@ -0,0 +1,94 @@ + +// ================================================================================================= +// This file is part of the 
CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xomatcopy class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/levelx/xomatcopy.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xomatcopy::Xomatcopy(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue()) { + source_string_ = + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xomatcopy::DoOmatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } + + // Determines whether to transpose the matrix A + const auto transpose = (a_transpose != Transpose::kNo); + + // In case of complex data-types, the transpose can also become a conjugate transpose + const auto conjugate = (a_transpose == Transpose::kConjugate); + + // Computes the dimensions of 
the two matrices + const auto rotated = (layout == Layout::kRowMajor); + const auto a_one = (rotated) ? n : m; + const auto a_two = (rotated) ? m : n; + const auto b_one = (transpose) ? a_two : a_one; + const auto b_two = (transpose) ? a_one : a_two; + + // Tests the matrices for validity, first from a perspective of the OpenCL buffers and their + // sizes, and then from a perspective of parameter values (e.g. m, n). Tests whether the OpenCL + // buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage space. + // Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than M when not-rotated + // matrix B cannot be less than M when rotated, or less than N when not-rotated + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); + if (ErrorIn(status)) { return status; } + status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); + if (ErrorIn(status)) { return status; } + + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + + auto emptyEventList = std::vector(); + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, + b_one, b_two, b_ld, b_offset, b_buffer, + alpha, program, false, transpose, conjugate); + if (ErrorIn(status)) { return status; } + + return StatusCode::kSuccess; +} + +// ================================================================================================= + +// Compiles the templated class +template class Xomatcopy; +template class Xomatcopy; +template class Xomatcopy; +template class Xomatcopy; +template class Xomatcopy; + +// ================================================================================================= +} // namespace clblast diff --git a/src/tuning/kernels/copy_fast.cc b/src/tuning/kernels/copy_fast.cc deleted file mode 100644 index 34269bc7..00000000 --- 
a/src/tuning/kernels/copy_fast.cc +++ /dev/null @@ -1,122 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels. -// -// ================================================================================================= - -#include -#include - -#include "utilities.hpp" -#include "tuning/tuning.hpp" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneCopy { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "copy"; } - static std::string KernelName() { return "CopyMatrixFast"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/copy_fast.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A 
for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32}); - tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32}); - tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8}); - tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"COPY_DIMX", "COPY_DIMY"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"COPY_VW", "COPY_WPT"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); - } - - // Describes how 
to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp new file mode 100644 index 00000000..34269bc7 --- /dev/null +++ b/src/tuning/kernels/copy_fast.cpp @@ -0,0 +1,122 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels. 
+// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneCopy { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "copy"; } + static std::string KernelName() { return "CopyMatrixFast"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/copy_fast.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32}); + 
tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32}); + tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8}); + tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"COPY_DIMX", "COPY_DIMY"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"COPY_VW", "COPY_WPT"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentInput(alpha_buffer); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * args.m * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int 
argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/copy_pad.cc b/src/tuning/kernels/copy_pad.cc deleted file mode 100644 index 1e0dccd3..00000000 --- a/src/tuning/kernels/copy_pad.cc +++ /dev/null @@ -1,130 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels. 
-// -// ================================================================================================= - -#include -#include - -#include "utilities.hpp" -#include "tuning/tuning.hpp" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TunePad { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "pad"; } - static std::string KernelName() { return "CopyPadMatrix"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/copy_pad.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "PAD_DIMX", {8, 16, 32}); - 
tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32}); - tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4}); - tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"PAD_DIMX", "PAD_DIMY"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"PAD_WPTX", "PAD_WPTY"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentScalar(0); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return 
"GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp new file mode 100644 index 00000000..1e0dccd3 --- /dev/null +++ b/src/tuning/kernels/copy_pad.cpp @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels. 
+// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TunePad { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "pad"; } + static std::string KernelName() { return "CopyPadMatrix"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/copy_pad.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "PAD_DIMX", {8, 16, 32}); + 
tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32}); + tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4}); + tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"PAD_DIMX", "PAD_DIMY"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"PAD_WPTX", "PAD_WPTY"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(0); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * args.m * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return 
"GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/transpose_fast.cc b/src/tuning/kernels/transpose_fast.cc deleted file mode 100644 index 7ac19cb6..00000000 --- a/src/tuning/kernels/transpose_fast.cc +++ /dev/null @@ -1,127 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels. 
-// -// ================================================================================================= - -#include -#include - -#include "utilities.hpp" -#include "tuning/tuning.hpp" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneTranspose { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "transpose"; } - static std::string KernelName() { return "TransposeMatrixFast"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/transpose_fast.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "TRA_DIM", {4, 8, 
16, 32, 64}); - tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16}); - tuner.AddParameter(id, "TRA_PAD", {0, 1}); - tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { - return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"}); - } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"TRA_DIM", "TRA_DIM"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"TRA_WPT", "TRA_WPT"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// 
================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp new file mode 100644 index 00000000..7ac19cb6 --- /dev/null +++ b/src/tuning/kernels/transpose_fast.cpp @@ -0,0 +1,127 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels. 
+// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneTranspose { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "transpose"; } + static std::string KernelName() { return "TransposeMatrixFast"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/transpose_fast.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "TRA_DIM", {4, 8, 
16, 32, 64}); + tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16}); + tuner.AddParameter(id, "TRA_PAD", {0, 1}); + tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { + auto LocalMemorySize = [args] (std::vector v) { + return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); + }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"}); + } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"TRA_DIM", "TRA_DIM"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"TRA_WPT", "TRA_WPT"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentInput(alpha_buffer); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * args.m * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// 
================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/transpose_pad.cc b/src/tuning/kernels/transpose_pad.cc deleted file mode 100644 index 63274415..00000000 --- a/src/tuning/kernels/transpose_pad.cc +++ /dev/null @@ -1,134 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels. 
-// -// ================================================================================================= - -#include -#include - -#include "utilities.hpp" -#include "tuning/tuning.hpp" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TunePadTranspose { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "padtranspose"; } - static std::string KernelName() { return "TransposePadMatrix"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/transpose_pad.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "PADTRA_TILE", 
{8, 16, 32, 64}); - tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16}); - tuner.AddParameter(id, "PADTRA_PAD", {0, 1}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { - return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"}); - } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"PADTRA_TILE", "PADTRA_TILE"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"PADTRA_WPT", "PADTRA_WPT"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentScalar(0); - 
} - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp new file mode 100644 index 00000000..63274415 --- /dev/null +++ b/src/tuning/kernels/transpose_pad.cpp @@ -0,0 +1,134 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels. 
+// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TunePadTranspose { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "padtranspose"; } + static std::string KernelName() { return "TransposePadMatrix"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/transpose_pad.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "PADTRA_TILE", 
{8, 16, 32, 64}); + tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16}); + tuner.AddParameter(id, "PADTRA_PAD", {0, 1}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { + auto LocalMemorySize = [args] (std::vector v) { + return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); + }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"}); + } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"PADTRA_TILE", "PADTRA_TILE"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"PADTRA_WPT", "PADTRA_WPT"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(0); + 
} + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * args.m * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/xaxpy.cc b/src/tuning/kernels/xaxpy.cc deleted file mode 100644 index 88d12c1f..00000000 --- a/src/tuning/kernels/xaxpy.cc +++ /dev/null @@ -1,125 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the xaxpy OpenCL kernels. 
-// -// ================================================================================================= - -#include -#include - -#include "utilities.hpp" -#include "tuning/tuning.hpp" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneXaxpy { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "xaxpy"; } - static std::string KernelName() { return "XaxpyFast"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level1/level1.opencl" - #include "../src/kernels/level1/xaxpy.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgN, kArgAlpha}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &args) { - if (!IsMultiple(args.n, 64)) { - throw std::runtime_error("'XaxpyFast' requires 'n' to be a multiple of WGS*WPT*VW"); - } - } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1; } // N/A for this kernel - static size_t DefaultN() { return 4096*1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { return args.n; } - static size_t GetSizeY(const Arguments &args) { return args.n; } - static size_t GetSizeA(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static 
void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 2048}); - tuner.AddParameter(id, "WPT", {1, 2, 4, 8}); - tuner.AddParameter(id, "VW", {1, 2, 4, 8}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1}; } - static std::vector LocalSizeRef() { return {64}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"WGS"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"WPT"},{"VW"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &x_vec, std::vector &y_vec, - std::vector &, std::vector &, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentOutput(y_vec); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 3 * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) 
-int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp new file mode 100644 index 00000000..88d12c1f --- /dev/null +++ b/src/tuning/kernels/xaxpy.cpp @@ -0,0 +1,125 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the xaxpy OpenCL kernels. 
+// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneXaxpy { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "xaxpy"; } + static std::string KernelName() { return "XaxpyFast"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level1/level1.opencl" + #include "../src/kernels/level1/xaxpy.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgN, kArgAlpha}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &args) { + if (!IsMultiple(args.n, 64)) { + throw std::runtime_error("'XaxpyFast' requires 'n' to be a multiple of WGS*WPT*VW"); + } + } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1; } // N/A for this kernel + static size_t DefaultN() { return 4096*1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { return args.n; } + static size_t GetSizeY(const Arguments &args) { return args.n; } + static size_t GetSizeA(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static 
void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 2048}); + tuner.AddParameter(id, "WPT", {1, 2, 4, 8}); + tuner.AddParameter(id, "VW", {1, 2, 4, 8}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1}; } + static std::vector LocalSizeRef() { return {64}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"WGS"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"WPT"},{"VW"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &x_vec, std::vector &y_vec, + std::vector &, std::vector &, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentInput(x_vec); + tuner.AddArgumentOutput(y_vec); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 3 * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) 
+int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/xdot.cc b/src/tuning/kernels/xdot.cc deleted file mode 100644 index 1581e13f..00000000 --- a/src/tuning/kernels/xdot.cc +++ /dev/null @@ -1,137 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the xdot OpenCL kernels. Note that the results are -// not verified, since the result is not final and depends on the WGS2 parameter. -// -// ================================================================================================= - -#include -#include - -#include "utilities.hpp" -#include "tuning/tuning.hpp" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneXdot { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "xdot_"+std::to_string(V); } - static std::string KernelName() { return (V==1) ? 
"Xdot" : "XdotEpilogue"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level1/xdot.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgN}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1; } // N/A for this kernel - static size_t DefaultN() { return 2*1024*1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { return args.n; } - static size_t GetSizeY(const Arguments &args) { return args.n; } - static size_t GetSizeA(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &args) { return args.n; } // Worst case - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &) { return (V==1) ? std::vector{2*64} : std::vector{1}; } - static std::vector GlobalSizeRef(const Arguments &) { return (V==1) ? 
std::vector{2*64*64} : std::vector{64}; } - static std::vector LocalSize() { return {1}; } - static std::vector LocalSizeRef() { return {64}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } - static TransformVector DivGlobal() { return {}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &x_vec, std::vector &y_vec, - std::vector &, std::vector &, std::vector &, - std::vector &temp) { - if (V == 1) { - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentInput(y_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentInput(temp); // No output checking for the result - size varies - tuner.AddArgumentScalar(static_cast(false)); - } - else { - tuner.AddArgumentInput(temp); - tuner.AddArgumentInput(x_vec); // No output checking for the result - store somewhere - tuner.AddArgumentScalar(0); - } - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return (V==1) ? (2*args.n + 1) * GetBytes(args.precision) : 1 * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return (V==1) ? 
"GB/s" : "N/A"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Function to tune a specific variation V (not within the clblast namespace) -template -void StartVariation(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } -} - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - StartVariation<1>(argc, argv); - StartVariation<2>(argc, argv); - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp new file mode 100644 index 00000000..1581e13f --- /dev/null +++ b/src/tuning/kernels/xdot.cpp @@ -0,0 +1,137 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the xdot OpenCL kernels. Note that the results are +// not verified, since the result is not final and depends on the WGS2 parameter. 
+// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneXdot { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "xdot_"+std::to_string(V); } + static std::string KernelName() { return (V==1) ? "Xdot" : "XdotEpilogue"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level1/xdot.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgN}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1; } // N/A for this kernel + static size_t DefaultN() { return 2*1024*1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { return args.n; } + static size_t GetSizeY(const Arguments &args) { return args.n; } + static size_t GetSizeA(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &args) { return args.n; } // Worst case + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024}); + } + + // 
Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &) { return (V==1) ? std::vector{2*64} : std::vector{1}; } + static std::vector GlobalSizeRef(const Arguments &) { return (V==1) ? std::vector{2*64*64} : std::vector{64}; } + static std::vector LocalSize() { return {1}; } + static std::vector LocalSizeRef() { return {64}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } + static TransformVector DivGlobal() { return {}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &x_vec, std::vector &y_vec, + std::vector &, std::vector &, std::vector &, + std::vector &temp) { + if (V == 1) { + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentInput(x_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentInput(y_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentInput(temp); // No output checking for the result - size varies + tuner.AddArgumentScalar(static_cast(false)); + } + else { + tuner.AddArgumentInput(temp); + tuner.AddArgumentInput(x_vec); // No output checking for the result - store somewhere + tuner.AddArgumentScalar(0); + } + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return (V==1) ? 
(2*args.n + 1) * GetBytes(args.precision) : 1 * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return (V==1) ? "GB/s" : "N/A"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Function to tune a specific variation V (not within the clblast namespace) +template +void StartVariation(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } +} + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/xgemm.cc b/src/tuning/kernels/xgemm.cc deleted file mode 100644 index 4b1efdef..00000000 --- a/src/tuning/kernels/xgemm.cc +++ /dev/null @@ -1,162 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. 
-// -// ================================================================================================= - -#include -#include - -#include "utilities.hpp" -#include "tuning/tuning.hpp" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneXgemm { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "xgemm"; } - static std::string KernelName() { return "Xgemm"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/xgemm_part1.opencl" - #include "../src/kernels/level3/xgemm_part2.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction}; - } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1024; } - static double DefaultFraction() { return 2048.0; } - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.k; } - static size_t GetSizeB(const Arguments &args) { return args.n * args.k; } - static size_t GetSizeC(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); - 
tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "KWG", {16, 32}); - tuner.AddParameter(id, "MDIMC", {8, 16, 32}); - tuner.AddParameter(id, "NDIMC", {8, 16, 32}); - tuner.AddParameter(id, "MDIMA", {8, 16, 32}); - tuner.AddParameter(id, "NDIMB", {8, 16, 32}); - tuner.AddParameter(id, "KWI", {2, 8}); - tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); - tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); - tuner.AddParameter(id, "STRM", {0, 1}); - tuner.AddParameter(id, "STRN", {0, 1}); - tuner.AddParameter(id, "SA", {0, 1}); - tuner.AddParameter(id, "SB", {0, 1}); - } - - // Sets the constraints - static void SetConstraints(cltune::Tuner &tuner, const size_t id) { - auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; - auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; - auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; - // Requirement for unrolling the KWG loop - tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"}); - // Required for integer MWI and NWI - tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}); - tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}); - // Required for integer MWIA and NWIB - tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}); - tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}); - // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) 
- tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}); - tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}); - } - - // Sets the local memory size - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { - return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM", - "SB", "KWG", "NWG", "VWN"}); - } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"MDIMC", "NDIMC"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {{"MDIMC", "NDIMC"}}; } - static TransformVector DivGlobal() { return {{"MWG", "NWG"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &c_mat, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - auto beta_buffer = std::vector{args.beta}; - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.k)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(beta_buffer); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentInput(b_mat); - tuner.AddArgumentOutput(c_mat); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * 
args.m * args.n * args.k; - } - static std::string PerformanceUnit() { return "GFLOPS"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp new file mode 100644 index 00000000..4b1efdef --- /dev/null +++ b/src/tuning/kernels/xgemm.cpp @@ -0,0 +1,162 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. 
+// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneXgemm { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "xgemm"; } + static std::string KernelName() { return "Xgemm"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/xgemm_part1.opencl" + #include "../src/kernels/level3/xgemm_part2.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction}; + } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1024; } + static double DefaultFraction() { return 2048.0; } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.k; } + static size_t GetSizeB(const Arguments &args) { return args.n * args.k; } + static size_t GetSizeC(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); + 
tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); + tuner.AddParameter(id, "KWG", {16, 32}); + tuner.AddParameter(id, "MDIMC", {8, 16, 32}); + tuner.AddParameter(id, "NDIMC", {8, 16, 32}); + tuner.AddParameter(id, "MDIMA", {8, 16, 32}); + tuner.AddParameter(id, "NDIMB", {8, 16, 32}); + tuner.AddParameter(id, "KWI", {2, 8}); + tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); + tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); + tuner.AddParameter(id, "STRM", {0, 1}); + tuner.AddParameter(id, "STRN", {0, 1}); + tuner.AddParameter(id, "SA", {0, 1}); + tuner.AddParameter(id, "SB", {0, 1}); + } + + // Sets the constraints + static void SetConstraints(cltune::Tuner &tuner, const size_t id) { + auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; + auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; + auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; + // Requirement for unrolling the KWG loop + tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"}); + // Required for integer MWI and NWI + tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}); + tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}); + // Required for integer MWIA and NWIB + tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}); + tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}); + // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) 
+ tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}); + tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}); + } + + // Sets the local memory size + static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { + auto LocalMemorySize = [args] (std::vector v) { + return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision)); + }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM", + "SB", "KWG", "NWG", "VWN"}); + } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"MDIMC", "NDIMC"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {{"MDIMC", "NDIMC"}}; } + static TransformVector DivGlobal() { return {{"MWG", "NWG"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &c_mat, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + auto beta_buffer = std::vector{args.beta}; + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.k)); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentInput(beta_buffer); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentInput(b_mat); + tuner.AddArgumentOutput(c_mat); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * 
args.m * args.n * args.k; + } + static std::string PerformanceUnit() { return "GFLOPS"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/xgemv.cc b/src/tuning/kernels/xgemv.cc deleted file mode 100644 index d42155ae..00000000 --- a/src/tuning/kernels/xgemv.cc +++ /dev/null @@ -1,156 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the xgemv OpenCL kernels. 
Three variants are tuned: -// 1: The full version of the kernel -// 2: The fast version for non-transposed matrices -// 3: The fast version for transposed matrices -// -// ================================================================================================= - -#include -#include - -#include "utilities.hpp" -#include "tuning/tuning.hpp" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneXgemv { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "xgemv_"+std::to_string(V); } - static std::string KernelName() { return (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot"); } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level2/xgemv.opencl" - #include "../src/kernels/level2/xgemv_fast.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha, kArgBeta}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 2048; } - static size_t DefaultN() { return 2048; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { return args.n; } - static size_t GetSizeY(const Arguments &args) { return args.m; } - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // 
N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256}); - tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); - if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); } - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &tuner, const size_t id) { - auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; - if (V==2 || V==3) { - tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}); - } - } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { return v[0]*GetBytes(args.precision); }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)}); - } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1}; } - static std::vector LocalSizeRef() { return {64}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"WGS"+std::to_string(V)}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"WPT"+std::to_string(V)}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &x_vec, std::vector &y_vec, - std::vector &a_mat, std::vector &, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - auto beta_buffer = std::vector{args.beta}; - auto a_rotated = (V==3) ? 
1 : 0; - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(beta_buffer); - tuner.AddArgumentScalar(static_cast(a_rotated)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentOutput(y_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentScalar(0); // Conjugate transpose - tuner.AddArgumentScalar(0); // Additional parameter - tuner.AddArgumentScalar(0); // Banded 'kl' - tuner.AddArgumentScalar(0); // Banded 'ku' - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return (args.m*args.n + 2*args.m + args.n) * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Function to tune a specific variation V (not within the clblast namespace) -template -void StartVariation(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } -} - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - StartVariation<1>(argc, argv); - StartVariation<2>(argc, argv); - StartVariation<3>(argc, 
argv); - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp new file mode 100644 index 00000000..d42155ae --- /dev/null +++ b/src/tuning/kernels/xgemv.cpp @@ -0,0 +1,156 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned: +// 1: The full version of the kernel +// 2: The fast version for non-transposed matrices +// 3: The fast version for transposed matrices +// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneXgemv { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "xgemv_"+std::to_string(V); } + static std::string KernelName() { return (V==1) ? "Xgemv" : ((V==2) ? 
"XgemvFast" : "XgemvFastRot"); } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level2/xgemv.opencl" + #include "../src/kernels/level2/xgemv_fast.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha, kArgBeta}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 2048; } + static size_t DefaultN() { return 2048; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { return args.n; } + static size_t GetSizeY(const Arguments &args) { return args.m; } + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256}); + tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); + if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); } + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &tuner, const size_t id) { + auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; + if (V==2 || V==3) { + tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}); + } + } + static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments 
&args) { + auto LocalMemorySize = [args] (std::vector v) { return v[0]*GetBytes(args.precision); }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)}); + } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1}; } + static std::vector LocalSizeRef() { return {64}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"WGS"+std::to_string(V)}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"WPT"+std::to_string(V)}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &x_vec, std::vector &y_vec, + std::vector &a_mat, std::vector &, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + auto beta_buffer = std::vector{args.beta}; + auto a_rotated = (V==3) ? 
1 : 0; + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentInput(beta_buffer); + tuner.AddArgumentScalar(static_cast(a_rotated)); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentInput(x_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentOutput(y_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentScalar(0); // Conjugate transpose + tuner.AddArgumentScalar(0); // Additional parameter + tuner.AddArgumentScalar(0); // Banded 'kl' + tuner.AddArgumentScalar(0); // Banded 'ku' + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return (args.m*args.n + 2*args.m + args.n) * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Function to tune a specific variation V (not within the clblast namespace) +template +void StartVariation(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } +} + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); + StartVariation<3>(argc, 
argv); + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/xger.cc b/src/tuning/kernels/xger.cc deleted file mode 100644 index d2590c53..00000000 --- a/src/tuning/kernels/xger.cc +++ /dev/null @@ -1,130 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the xger OpenCL kernels. -// -// ================================================================================================= - -#include -#include - -#include "utilities.hpp" -#include "tuning/tuning.hpp" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneXger { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "xger"; } - static std::string KernelName() { return "Xger"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level2/level2.opencl" - #include "../src/kernels/level2/xger.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgN, kArgM, kArgAlpha}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; 
} // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { return args.m; } - static size_t GetSizeY(const Arguments &args) { return args.n; } - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS1", {4, 8, 16, 32, 64, 128, 256, 512}); - tuner.AddParameter(id, "WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256}); - tuner.AddParameter(id, "WPT", {1, 2, 4}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"WGS1", "WGS2"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"WPT", "WPT"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &x_vec, std::vector &y_vec, - std::vector &a_mat, std::vector &, std::vector &, - std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - 
tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); // x_offset - tuner.AddArgumentScalar(1); // x_increment - tuner.AddArgumentInput(y_vec); - tuner.AddArgumentScalar(0); // y_offset - tuner.AddArgumentScalar(1); // y_increment - tuner.AddArgumentOutput(a_mat); - tuner.AddArgumentScalar(0); // a_offset - tuner.AddArgumentScalar(static_cast(args.m)); // a_ld - tuner.AddArgumentScalar(0); // a_is_rowmajor - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp new file mode 100644 index 00000000..d2590c53 --- /dev/null +++ b/src/tuning/kernels/xger.cpp @@ -0,0 +1,130 @@ + +// ================================================================================================= 
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the xger OpenCL kernels. +// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneXger { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "xger"; } + static std::string KernelName() { return "Xger"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level2/level2.opencl" + #include "../src/kernels/level2/xger.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgN, kArgM, kArgAlpha}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { return args.m; } + static size_t GetSizeY(const Arguments &args) { return args.n; } + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeC(const Arguments 
&) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "WGS1", {4, 8, 16, 32, 64, 128, 256, 512}); + tuner.AddParameter(id, "WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256}); + tuner.AddParameter(id, "WPT", {1, 2, 4}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"WGS1", "WGS2"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"WPT", "WPT"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &x_vec, std::vector &y_vec, + std::vector &a_mat, std::vector &, std::vector &, + std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentInput(x_vec); + tuner.AddArgumentScalar(0); // x_offset + tuner.AddArgumentScalar(1); // x_increment + tuner.AddArgumentInput(y_vec); + tuner.AddArgumentScalar(0); // y_offset + tuner.AddArgumentScalar(1); // y_increment + tuner.AddArgumentOutput(a_mat); + 
tuner.AddArgumentScalar(0); // a_offset + tuner.AddArgumentScalar(static_cast(args.m)); // a_ld + tuner.AddArgumentScalar(0); // a_is_rowmajor + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/utilities.cc b/src/utilities.cc deleted file mode 100644 index e3a1fb75..00000000 --- a/src/utilities.cc +++ /dev/null @@ -1,390 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the common (test) utility functions. 
-// -// ================================================================================================= - -#include "utilities.hpp" - -#include -#include -#include -#include -#include - -namespace clblast { -// ================================================================================================= - -// Returns a scalar with a default value -template -T GetScalar() { - return static_cast(2.0); -} -template float GetScalar(); -template double GetScalar(); - -// Specialized version of the above for half-precision -template <> -half GetScalar() { - return FloatToHalf(2.0f); -} - -// Specialized versions of the above for complex data-types -template <> -float2 GetScalar() { - return {2.0f, 0.5f}; -} -template <> -double2 GetScalar() { - return {2.0, 0.5}; -} - -// Returns a scalar of value 1 -template -T ConstantOne() { - return static_cast(1.0); -} -template float ConstantOne(); -template double ConstantOne(); - -// Specialized version of the above for half-precision -template <> -half ConstantOne() { - return FloatToHalf(1.0f); -} - -// Specialized versions of the above for complex data-types -template <> -float2 ConstantOne() { - return {1.0f, 0.0f}; -} -template <> -double2 ConstantOne() { - return {1.0, 0.0}; -} - -// ================================================================================================= - -// Implements the string conversion using std::to_string if possible -template -std::string ToString(T value) { - return std::to_string(value); -} -template std::string ToString(int value); -template std::string ToString(size_t value); -template std::string ToString(float value); -template std::string ToString(double value); - -// If not possible directly: special cases for complex data-types -template <> -std::string ToString(float2 value) { - std::ostringstream real, imag; - real << std::setprecision(2) << value.real(); - imag << std::setprecision(2) << value.imag(); - return real.str()+"+"+imag.str()+"i"; -} -template <> -std::string 
ToString(double2 value) { - std::ostringstream real, imag; - real << std::setprecision(2) << value.real(); - imag << std::setprecision(2) << value.imag(); - return real.str()+"+"+imag.str()+"i"; -} - -// If not possible directly: special case for half-precision -template <> -std::string ToString(half value) { - return std::to_string(HalfToFloat(value)); -} - -// If not possible directly: special cases for CLBlast data-types -template <> -std::string ToString(Layout value) { - switch(value) { - case Layout::kRowMajor: return ToString(static_cast(value))+" (row-major)"; - case Layout::kColMajor: return ToString(static_cast(value))+" (col-major)"; - } -} -template <> -std::string ToString(Transpose value) { - switch(value) { - case Transpose::kNo: return ToString(static_cast(value))+" (regular)"; - case Transpose::kYes: return ToString(static_cast(value))+" (transposed)"; - case Transpose::kConjugate: return ToString(static_cast(value))+" (conjugate)"; - } -} -template <> -std::string ToString(Side value) { - switch(value) { - case Side::kLeft: return ToString(static_cast(value))+" (left)"; - case Side::kRight: return ToString(static_cast(value))+" (right)"; - } -} -template <> -std::string ToString(Triangle value) { - switch(value) { - case Triangle::kUpper: return ToString(static_cast(value))+" (upper)"; - case Triangle::kLower: return ToString(static_cast(value))+" (lower)"; - } -} -template <> -std::string ToString(Diagonal value) { - switch(value) { - case Diagonal::kUnit: return ToString(static_cast(value))+" (unit)"; - case Diagonal::kNonUnit: return ToString(static_cast(value))+" (non-unit)"; - } -} -template <> -std::string ToString(Precision value) { - switch(value) { - case Precision::kHalf: return ToString(static_cast(value))+" (half)"; - case Precision::kSingle: return ToString(static_cast(value))+" (single)"; - case Precision::kDouble: return ToString(static_cast(value))+" (double)"; - case Precision::kComplexSingle: return ToString(static_cast(value))+" 
(complex-single)"; - case Precision::kComplexDouble: return ToString(static_cast(value))+" (complex-double)"; - } -} - -// ================================================================================================= - -// Helper for the below function to convert the argument to the value type. Adds specialization for -// complex data-types. Note that complex arguments are accepted as regular values and are copied to -// both the real and imaginary parts. -template -T ConvertArgument(const char* value) { - return static_cast(std::stoi(value)); -} -template <> half ConvertArgument(const char* value) { - return FloatToHalf(static_cast(std::stod(value))); -} -template <> float ConvertArgument(const char* value) { - return static_cast(std::stod(value)); -} -template <> double ConvertArgument(const char* value) { - return static_cast(std::stod(value)); -} -template <> float2 ConvertArgument(const char* value) { - auto val = static_cast(std::stod(value)); - return float2{val, val}; -} -template <> double2 ConvertArgument(const char* value) { - auto val = static_cast(std::stod(value)); - return double2{val, val}; -} - -// This function matches patterns in the form of "-option value" or "--option value". It returns a -// default value in case the option is not found in the argument string. -template -T GetArgument(const int argc, char *argv[], std::string &help, - const std::string &option, const T default_value) { - - // Parses the argument. Note that this supports both the given option (e.g. -device) and one with - // an extra dash in front (e.g. --device). - auto return_value = static_cast(default_value); - for (int c=0; c(argv[c]); - break; - } - } - - // Updates the help message and returns - help += " -"+option+" "+ToString(return_value)+" "; - help += (return_value == default_value) ? 
"[=default]\n" : "\n"; - return return_value; -} - -// Compiles the above function -template int GetArgument(const int, char **, std::string&, const std::string&, const int); -template size_t GetArgument(const int, char **, std::string&, const std::string&, const size_t); -template half GetArgument(const int, char **, std::string&, const std::string&, const half); -template float GetArgument(const int, char **, std::string&, const std::string&, const float); -template double GetArgument(const int, char **, std::string&, const std::string&, const double); -template float2 GetArgument(const int, char **, std::string&, const std::string&, const float2); -template double2 GetArgument(const int, char **, std::string&, const std::string&, const double2); -template Layout GetArgument(const int, char **, std::string&, const std::string&, const Layout); -template Transpose GetArgument(const int, char **, std::string&, const std::string&, const Transpose); -template Side GetArgument(const int, char **, std::string&, const std::string&, const Side); -template Triangle GetArgument(const int, char **, std::string&, const std::string&, const Triangle); -template Diagonal GetArgument(const int, char **, std::string&, const std::string&, const Diagonal); -template Precision GetArgument(const int, char **, std::string&, const std::string&, const Precision); - -// ================================================================================================= - -// Returns only the precision argument -Precision GetPrecision(const int argc, char *argv[], const Precision default_precision) { - auto dummy = std::string{}; - return GetArgument(argc, argv, dummy, kArgPrecision, default_precision); -} - -// ================================================================================================= - -// Checks whether an argument is given. Returns true or false. 
-bool CheckArgument(const int argc, char *argv[], std::string &help, - const std::string &option) { - - // Parses the argument. Note that this supports both the given option (e.g. -device) and one with - // an extra dash in front (e.g. --device). - auto return_value = false; - for (int c=0; c(std::chrono::system_clock::now().time_since_epoch().count()); -} - -// Create a random number generator and populates a vector with samples from a random distribution -template -void PopulateVector(std::vector &vector) { - auto lower_limit = static_cast(kTestDataLowerLimit); - auto upper_limit = static_cast(kTestDataUpperLimit); - std::mt19937 mt(GetRandomSeed()); - std::uniform_real_distribution dist(lower_limit, upper_limit); - for (auto &element: vector) { element = dist(mt); } -} -template void PopulateVector(std::vector&); -template void PopulateVector(std::vector&); - -// Specialized versions of the above for complex data-types -template <> -void PopulateVector(std::vector &vector) { - auto lower_limit = static_cast(kTestDataLowerLimit); - auto upper_limit = static_cast(kTestDataUpperLimit); - std::mt19937 mt(GetRandomSeed()); - std::uniform_real_distribution dist(lower_limit, upper_limit); - for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } -} -template <> -void PopulateVector(std::vector &vector) { - auto lower_limit = static_cast(kTestDataLowerLimit); - auto upper_limit = static_cast(kTestDataUpperLimit); - std::mt19937 mt(GetRandomSeed()); - std::uniform_real_distribution dist(lower_limit, upper_limit); - for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } -} - -// Specialized versions of the above for half-precision -template <> -void PopulateVector(std::vector &vector) { - const auto lower_limit = static_cast(kTestDataLowerLimit); - const auto upper_limit = static_cast(kTestDataUpperLimit); - std::mt19937 mt(GetRandomSeed()); - std::uniform_real_distribution dist(lower_limit, upper_limit); - for (auto 
&element: vector) { element = FloatToHalf(dist(mt)); } -} - -// ================================================================================================= - -// Conversion between half and single-precision -std::vector HalfToFloatBuffer(const std::vector& source) { - auto result = std::vector(source.size()); - for (auto i = size_t(0); i < source.size(); ++i) { result[i] = HalfToFloat(source[i]); } - return result; -} -void FloatToHalfBuffer(std::vector& result, const std::vector& source) { - for (auto i = size_t(0); i < source.size(); ++i) { result[i] = FloatToHalf(source[i]); } -} - -// As above, but now for OpenCL data-types instead of std::vectors -Buffer HalfToFloatBuffer(const Buffer& source, cl_command_queue queue_raw) { - const auto size = source.GetSize() / sizeof(half); - auto queue = Queue(queue_raw); - auto context = queue.GetContext(); - auto source_cpu = std::vector(size); - source.Read(queue, size, source_cpu); - auto result_cpu = HalfToFloatBuffer(source_cpu); - auto result = Buffer(context, size); - result.Write(queue, size, result_cpu); - return result; -} -void FloatToHalfBuffer(Buffer& result, const Buffer& source, cl_command_queue queue_raw) { - const auto size = source.GetSize() / sizeof(float); - auto queue = Queue(queue_raw); - auto context = queue.GetContext(); - auto source_cpu = std::vector(size); - source.Read(queue, size, source_cpu); - auto result_cpu = std::vector(size); - FloatToHalfBuffer(result_cpu, source_cpu); - result.Write(queue, size, result_cpu); -} - -// ================================================================================================= - -// Rounding functions performing ceiling and division operations -size_t CeilDiv(const size_t x, const size_t y) { - return 1 + ((x - 1) / y); -} -size_t Ceil(const size_t x, const size_t y) { - return CeilDiv(x,y)*y; -} - -// Helper function to determine whether or not 'a' is a multiple of 'b' -bool IsMultiple(const size_t a, const size_t b) { - return ((a/b)*b == a) ? 
true : false; -}; - -// ================================================================================================= - -// Convert the precision enum (as integer) into bytes -size_t GetBytes(const Precision precision) { - switch(precision) { - case Precision::kHalf: return 2; - case Precision::kSingle: return 4; - case Precision::kDouble: return 8; - case Precision::kComplexSingle: return 8; - case Precision::kComplexDouble: return 16; - } -} - -// Convert the template argument into a precision value -template <> Precision PrecisionValue() { return Precision::kHalf; } -template <> Precision PrecisionValue() { return Precision::kSingle; } -template <> Precision PrecisionValue() { return Precision::kDouble; } -template <> Precision PrecisionValue() { return Precision::kComplexSingle; } -template <> Precision PrecisionValue() { return Precision::kComplexDouble; } - -// ================================================================================================= - -// Returns false is this precision is not supported by the device -template <> bool PrecisionSupported(const Device &) { return true; } -template <> bool PrecisionSupported(const Device &) { return true; } -template <> bool PrecisionSupported(const Device &device) { - auto extensions = device.Capabilities(); - return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true; -} -template <> bool PrecisionSupported(const Device &device) { - auto extensions = device.Capabilities(); - return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true; -} -template <> bool PrecisionSupported(const Device &device) { - auto extensions = device.Capabilities(); - if (device.Name() == "Mali-T628") { return true; } // supports fp16 but not cl_khr_fp16 officially - return (extensions.find(kKhronosHalfPrecision) == std::string::npos) ? 
false : true; -} - -// ================================================================================================= -} // namespace clblast diff --git a/src/utilities.cpp b/src/utilities.cpp new file mode 100644 index 00000000..e3a1fb75 --- /dev/null +++ b/src/utilities.cpp @@ -0,0 +1,390 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the common (test) utility functions. +// +// ================================================================================================= + +#include "utilities.hpp" + +#include +#include +#include +#include +#include + +namespace clblast { +// ================================================================================================= + +// Returns a scalar with a default value +template +T GetScalar() { + return static_cast(2.0); +} +template float GetScalar(); +template double GetScalar(); + +// Specialized version of the above for half-precision +template <> +half GetScalar() { + return FloatToHalf(2.0f); +} + +// Specialized versions of the above for complex data-types +template <> +float2 GetScalar() { + return {2.0f, 0.5f}; +} +template <> +double2 GetScalar() { + return {2.0, 0.5}; +} + +// Returns a scalar of value 1 +template +T ConstantOne() { + return static_cast(1.0); +} +template float ConstantOne(); +template double ConstantOne(); + +// Specialized version of the above for half-precision +template <> +half ConstantOne() { + return FloatToHalf(1.0f); +} + +// Specialized versions of the above for complex data-types +template <> +float2 ConstantOne() { + return {1.0f, 0.0f}; +} +template <> +double2 ConstantOne() { + return {1.0, 0.0}; +} + +// 
================================================================================================= + +// Implements the string conversion using std::to_string if possible +template +std::string ToString(T value) { + return std::to_string(value); +} +template std::string ToString(int value); +template std::string ToString(size_t value); +template std::string ToString(float value); +template std::string ToString(double value); + +// If not possible directly: special cases for complex data-types +template <> +std::string ToString(float2 value) { + std::ostringstream real, imag; + real << std::setprecision(2) << value.real(); + imag << std::setprecision(2) << value.imag(); + return real.str()+"+"+imag.str()+"i"; +} +template <> +std::string ToString(double2 value) { + std::ostringstream real, imag; + real << std::setprecision(2) << value.real(); + imag << std::setprecision(2) << value.imag(); + return real.str()+"+"+imag.str()+"i"; +} + +// If not possible directly: special case for half-precision +template <> +std::string ToString(half value) { + return std::to_string(HalfToFloat(value)); +} + +// If not possible directly: special cases for CLBlast data-types +template <> +std::string ToString(Layout value) { + switch(value) { + case Layout::kRowMajor: return ToString(static_cast(value))+" (row-major)"; + case Layout::kColMajor: return ToString(static_cast(value))+" (col-major)"; + } +} +template <> +std::string ToString(Transpose value) { + switch(value) { + case Transpose::kNo: return ToString(static_cast(value))+" (regular)"; + case Transpose::kYes: return ToString(static_cast(value))+" (transposed)"; + case Transpose::kConjugate: return ToString(static_cast(value))+" (conjugate)"; + } +} +template <> +std::string ToString(Side value) { + switch(value) { + case Side::kLeft: return ToString(static_cast(value))+" (left)"; + case Side::kRight: return ToString(static_cast(value))+" (right)"; + } +} +template <> +std::string ToString(Triangle value) { + switch(value) { + 
case Triangle::kUpper: return ToString(static_cast(value))+" (upper)"; + case Triangle::kLower: return ToString(static_cast(value))+" (lower)"; + } +} +template <> +std::string ToString(Diagonal value) { + switch(value) { + case Diagonal::kUnit: return ToString(static_cast(value))+" (unit)"; + case Diagonal::kNonUnit: return ToString(static_cast(value))+" (non-unit)"; + } +} +template <> +std::string ToString(Precision value) { + switch(value) { + case Precision::kHalf: return ToString(static_cast(value))+" (half)"; + case Precision::kSingle: return ToString(static_cast(value))+" (single)"; + case Precision::kDouble: return ToString(static_cast(value))+" (double)"; + case Precision::kComplexSingle: return ToString(static_cast(value))+" (complex-single)"; + case Precision::kComplexDouble: return ToString(static_cast(value))+" (complex-double)"; + } +} + +// ================================================================================================= + +// Helper for the below function to convert the argument to the value type. Adds specialization for +// complex data-types. Note that complex arguments are accepted as regular values and are copied to +// both the real and imaginary parts. +template +T ConvertArgument(const char* value) { + return static_cast(std::stoi(value)); +} +template <> half ConvertArgument(const char* value) { + return FloatToHalf(static_cast(std::stod(value))); +} +template <> float ConvertArgument(const char* value) { + return static_cast(std::stod(value)); +} +template <> double ConvertArgument(const char* value) { + return static_cast(std::stod(value)); +} +template <> float2 ConvertArgument(const char* value) { + auto val = static_cast(std::stod(value)); + return float2{val, val}; +} +template <> double2 ConvertArgument(const char* value) { + auto val = static_cast(std::stod(value)); + return double2{val, val}; +} + +// This function matches patterns in the form of "-option value" or "--option value". 
It returns a +// default value in case the option is not found in the argument string. +template +T GetArgument(const int argc, char *argv[], std::string &help, + const std::string &option, const T default_value) { + + // Parses the argument. Note that this supports both the given option (e.g. -device) and one with + // an extra dash in front (e.g. --device). + auto return_value = static_cast(default_value); + for (int c=0; c(argv[c]); + break; + } + } + + // Updates the help message and returns + help += " -"+option+" "+ToString(return_value)+" "; + help += (return_value == default_value) ? "[=default]\n" : "\n"; + return return_value; +} + +// Compiles the above function +template int GetArgument(const int, char **, std::string&, const std::string&, const int); +template size_t GetArgument(const int, char **, std::string&, const std::string&, const size_t); +template half GetArgument(const int, char **, std::string&, const std::string&, const half); +template float GetArgument(const int, char **, std::string&, const std::string&, const float); +template double GetArgument(const int, char **, std::string&, const std::string&, const double); +template float2 GetArgument(const int, char **, std::string&, const std::string&, const float2); +template double2 GetArgument(const int, char **, std::string&, const std::string&, const double2); +template Layout GetArgument(const int, char **, std::string&, const std::string&, const Layout); +template Transpose GetArgument(const int, char **, std::string&, const std::string&, const Transpose); +template Side GetArgument(const int, char **, std::string&, const std::string&, const Side); +template Triangle GetArgument(const int, char **, std::string&, const std::string&, const Triangle); +template Diagonal GetArgument(const int, char **, std::string&, const std::string&, const Diagonal); +template Precision GetArgument(const int, char **, std::string&, const std::string&, const Precision); + +// 
================================================================================================= + +// Returns only the precision argument +Precision GetPrecision(const int argc, char *argv[], const Precision default_precision) { + auto dummy = std::string{}; + return GetArgument(argc, argv, dummy, kArgPrecision, default_precision); +} + +// ================================================================================================= + +// Checks whether an argument is given. Returns true or false. +bool CheckArgument(const int argc, char *argv[], std::string &help, + const std::string &option) { + + // Parses the argument. Note that this supports both the given option (e.g. -device) and one with + // an extra dash in front (e.g. --device). + auto return_value = false; + for (int c=0; c(std::chrono::system_clock::now().time_since_epoch().count()); +} + +// Create a random number generator and populates a vector with samples from a random distribution +template +void PopulateVector(std::vector &vector) { + auto lower_limit = static_cast(kTestDataLowerLimit); + auto upper_limit = static_cast(kTestDataUpperLimit); + std::mt19937 mt(GetRandomSeed()); + std::uniform_real_distribution dist(lower_limit, upper_limit); + for (auto &element: vector) { element = dist(mt); } +} +template void PopulateVector(std::vector&); +template void PopulateVector(std::vector&); + +// Specialized versions of the above for complex data-types +template <> +void PopulateVector(std::vector &vector) { + auto lower_limit = static_cast(kTestDataLowerLimit); + auto upper_limit = static_cast(kTestDataUpperLimit); + std::mt19937 mt(GetRandomSeed()); + std::uniform_real_distribution dist(lower_limit, upper_limit); + for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } +} +template <> +void PopulateVector(std::vector &vector) { + auto lower_limit = static_cast(kTestDataLowerLimit); + auto upper_limit = static_cast(kTestDataUpperLimit); + std::mt19937 
mt(GetRandomSeed()); + std::uniform_real_distribution dist(lower_limit, upper_limit); + for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } +} + +// Specialized versions of the above for half-precision +template <> +void PopulateVector(std::vector &vector) { + const auto lower_limit = static_cast(kTestDataLowerLimit); + const auto upper_limit = static_cast(kTestDataUpperLimit); + std::mt19937 mt(GetRandomSeed()); + std::uniform_real_distribution dist(lower_limit, upper_limit); + for (auto &element: vector) { element = FloatToHalf(dist(mt)); } +} + +// ================================================================================================= + +// Conversion between half and single-precision +std::vector HalfToFloatBuffer(const std::vector& source) { + auto result = std::vector(source.size()); + for (auto i = size_t(0); i < source.size(); ++i) { result[i] = HalfToFloat(source[i]); } + return result; +} +void FloatToHalfBuffer(std::vector& result, const std::vector& source) { + for (auto i = size_t(0); i < source.size(); ++i) { result[i] = FloatToHalf(source[i]); } +} + +// As above, but now for OpenCL data-types instead of std::vectors +Buffer HalfToFloatBuffer(const Buffer& source, cl_command_queue queue_raw) { + const auto size = source.GetSize() / sizeof(half); + auto queue = Queue(queue_raw); + auto context = queue.GetContext(); + auto source_cpu = std::vector(size); + source.Read(queue, size, source_cpu); + auto result_cpu = HalfToFloatBuffer(source_cpu); + auto result = Buffer(context, size); + result.Write(queue, size, result_cpu); + return result; +} +void FloatToHalfBuffer(Buffer& result, const Buffer& source, cl_command_queue queue_raw) { + const auto size = source.GetSize() / sizeof(float); + auto queue = Queue(queue_raw); + auto context = queue.GetContext(); + auto source_cpu = std::vector(size); + source.Read(queue, size, source_cpu); + auto result_cpu = std::vector(size); + FloatToHalfBuffer(result_cpu, source_cpu); + 
result.Write(queue, size, result_cpu); +} + +// ================================================================================================= + +// Rounding functions performing ceiling and division operations +size_t CeilDiv(const size_t x, const size_t y) { + return 1 + ((x - 1) / y); +} +size_t Ceil(const size_t x, const size_t y) { + return CeilDiv(x,y)*y; +} + +// Helper function to determine whether or not 'a' is a multiple of 'b' +bool IsMultiple(const size_t a, const size_t b) { + return ((a/b)*b == a) ? true : false; +}; + +// ================================================================================================= + +// Convert the precision enum (as integer) into bytes +size_t GetBytes(const Precision precision) { + switch(precision) { + case Precision::kHalf: return 2; + case Precision::kSingle: return 4; + case Precision::kDouble: return 8; + case Precision::kComplexSingle: return 8; + case Precision::kComplexDouble: return 16; + } +} + +// Convert the template argument into a precision value +template <> Precision PrecisionValue() { return Precision::kHalf; } +template <> Precision PrecisionValue() { return Precision::kSingle; } +template <> Precision PrecisionValue() { return Precision::kDouble; } +template <> Precision PrecisionValue() { return Precision::kComplexSingle; } +template <> Precision PrecisionValue() { return Precision::kComplexDouble; } + +// ================================================================================================= + +// Returns false is this precision is not supported by the device +template <> bool PrecisionSupported(const Device &) { return true; } +template <> bool PrecisionSupported(const Device &) { return true; } +template <> bool PrecisionSupported(const Device &device) { + auto extensions = device.Capabilities(); + return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? 
false : true; +} +template <> bool PrecisionSupported(const Device &device) { + auto extensions = device.Capabilities(); + return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true; +} +template <> bool PrecisionSupported(const Device &device) { + auto extensions = device.Capabilities(); + if (device.Name() == "Mali-T628") { return true; } // supports fp16 but not cl_khr_fp16 officially + return (extensions.find(kKhronosHalfPrecision) == std::string::npos) ? false : true; +} + +// ================================================================================================= +} // namespace clblast diff --git a/test/correctness/routines/level1/xamax.cc b/test/correctness/routines/level1/xamax.cc deleted file mode 100644 index 607637e8..00000000 --- a/test/correctness/routines/level1/xamax.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xamax.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "iSAMAX"); - errors += clblast::RunTests, double, double>(argc, argv, true, "iDAMAX"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "iCAMAX"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "iZAMAX"); - errors += clblast::RunTests, half, half>(argc, argv, true, "iHAMAX"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xamax.cpp b/test/correctness/routines/level1/xamax.cpp new file mode 100644 index 00000000..607637e8 --- /dev/null +++ b/test/correctness/routines/level1/xamax.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xamax.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "iSAMAX"); + errors += clblast::RunTests, double, double>(argc, argv, true, "iDAMAX"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "iCAMAX"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "iZAMAX"); + errors += clblast::RunTests, half, half>(argc, argv, true, "iHAMAX"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xasum.cc b/test/correctness/routines/level1/xasum.cc deleted file mode 100644 index e22e42a6..00000000 --- a/test/correctness/routines/level1/xasum.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xasum.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SASUM"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DASUM"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "ScASUM"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "DzASUM"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HASUM"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xasum.cpp b/test/correctness/routines/level1/xasum.cpp new file mode 100644 index 00000000..e22e42a6 --- /dev/null +++ b/test/correctness/routines/level1/xasum.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xasum.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SASUM"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DASUM"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "ScASUM"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "DzASUM"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HASUM"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xaxpy.cc b/test/correctness/routines/level1/xaxpy.cc deleted file mode 100644 index 064172fa..00000000 --- a/test/correctness/routines/level1/xaxpy.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xaxpy.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SAXPY"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DAXPY"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CAXPY"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZAXPY"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HAXPY"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xaxpy.cpp b/test/correctness/routines/level1/xaxpy.cpp new file mode 100644 index 00000000..064172fa --- /dev/null +++ b/test/correctness/routines/level1/xaxpy.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xaxpy.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SAXPY"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DAXPY"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CAXPY"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZAXPY"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HAXPY"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xcopy.cc b/test/correctness/routines/level1/xcopy.cc deleted file mode 100644 index e6f2581b..00000000 --- a/test/correctness/routines/level1/xcopy.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xcopy.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SCOPY"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DCOPY"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CCOPY"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZCOPY"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HCOPY"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xcopy.cpp b/test/correctness/routines/level1/xcopy.cpp new file mode 100644 index 00000000..e6f2581b --- /dev/null +++ b/test/correctness/routines/level1/xcopy.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xcopy.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SCOPY"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DCOPY"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CCOPY"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZCOPY"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HCOPY"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xdot.cc b/test/correctness/routines/level1/xdot.cc deleted file mode 100644 index 080250cb..00000000 --- a/test/correctness/routines/level1/xdot.cc +++ /dev/null @@ -1,28 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xdot.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SDOT"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DDOT"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HDOT"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xdot.cpp b/test/correctness/routines/level1/xdot.cpp new file mode 100644 index 00000000..080250cb --- /dev/null +++ b/test/correctness/routines/level1/xdot.cpp @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xdot.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SDOT"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DDOT"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HDOT"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xdotc.cc b/test/correctness/routines/level1/xdotc.cc deleted file mode 100644 index 2a7bbeca..00000000 --- a/test/correctness/routines/level1/xdotc.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xdotc.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CDOTC"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZDOTC"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xdotc.cpp b/test/correctness/routines/level1/xdotc.cpp new file mode 100644 index 00000000..2a7bbeca --- /dev/null +++ b/test/correctness/routines/level1/xdotc.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xdotc.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float2>(argc, argv, false, "CDOTC"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZDOTC"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xdotu.cc b/test/correctness/routines/level1/xdotu.cc deleted file mode 100644 index 1047d021..00000000 --- a/test/correctness/routines/level1/xdotu.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xdotu.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CDOTU"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZDOTU"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xdotu.cpp b/test/correctness/routines/level1/xdotu.cpp new file mode 100644 index 00000000..1047d021 --- /dev/null +++ b/test/correctness/routines/level1/xdotu.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xdotu.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float2>(argc, argv, false, "CDOTU"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZDOTU"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xnrm2.cc b/test/correctness/routines/level1/xnrm2.cc deleted file mode 100644 index 142fa7ba..00000000 --- a/test/correctness/routines/level1/xnrm2.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xnrm2.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SNRM2"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DNRM2"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "ScNRM2"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "DzNRM2"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HNRM2"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xnrm2.cpp b/test/correctness/routines/level1/xnrm2.cpp new file mode 100644 index 00000000..142fa7ba --- /dev/null +++ b/test/correctness/routines/level1/xnrm2.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xnrm2.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SNRM2"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DNRM2"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "ScNRM2"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "DzNRM2"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HNRM2"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xrot.cc b/test/correctness/routines/level1/xrot.cc deleted file mode 100644 index 5af358eb..00000000 --- a/test/correctness/routines/level1/xrot.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xrot.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SROT"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DROT"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xrot.cpp b/test/correctness/routines/level1/xrot.cpp new file mode 100644 index 00000000..5af358eb --- /dev/null +++ b/test/correctness/routines/level1/xrot.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xrot.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SROT"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DROT"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xrotg.cc b/test/correctness/routines/level1/xrotg.cc deleted file mode 100644 index ad23a554..00000000 --- a/test/correctness/routines/level1/xrotg.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xrotg.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SROTG"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DROTG"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xrotg.cpp b/test/correctness/routines/level1/xrotg.cpp new file mode 100644 index 00000000..ad23a554 --- /dev/null +++ b/test/correctness/routines/level1/xrotg.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xrotg.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SROTG"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DROTG"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xrotm.cc b/test/correctness/routines/level1/xrotm.cc deleted file mode 100644 index 4f7e8f15..00000000 --- a/test/correctness/routines/level1/xrotm.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xrotm.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SROTM"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DROTM"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xrotm.cpp b/test/correctness/routines/level1/xrotm.cpp new file mode 100644 index 00000000..4f7e8f15 --- /dev/null +++ b/test/correctness/routines/level1/xrotm.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xrotm.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SROTM"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DROTM"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xrotmg.cc b/test/correctness/routines/level1/xrotmg.cc deleted file mode 100644 index ca89bc12..00000000 --- a/test/correctness/routines/level1/xrotmg.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xrotmg.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SROTMG"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DROTMG"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xrotmg.cpp b/test/correctness/routines/level1/xrotmg.cpp new file mode 100644 index 00000000..ca89bc12 --- /dev/null +++ b/test/correctness/routines/level1/xrotmg.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xrotmg.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SROTMG"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DROTMG"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xscal.cc b/test/correctness/routines/level1/xscal.cc deleted file mode 100644 index 939524be..00000000 --- a/test/correctness/routines/level1/xscal.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xscal.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SSCAL"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DSCAL"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSCAL"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSCAL"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSCAL"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xscal.cpp b/test/correctness/routines/level1/xscal.cpp new file mode 100644 index 00000000..939524be --- /dev/null +++ b/test/correctness/routines/level1/xscal.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xscal.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SSCAL"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DSCAL"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSCAL"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSCAL"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HSCAL"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xswap.cc b/test/correctness/routines/level1/xswap.cc deleted file mode 100644 index 446f3d65..00000000 --- a/test/correctness/routines/level1/xswap.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level1/xswap.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SSWAP"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DSWAP"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSWAP"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSWAP"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSWAP"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level1/xswap.cpp b/test/correctness/routines/level1/xswap.cpp new file mode 100644 index 00000000..446f3d65 --- /dev/null +++ b/test/correctness/routines/level1/xswap.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level1/xswap.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SSWAP"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DSWAP"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSWAP"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSWAP"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HSWAP"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xgbmv.cc b/test/correctness/routines/level2/xgbmv.cc deleted file mode 100644 index 8c49bc65..00000000 --- a/test/correctness/routines/level2/xgbmv.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xgbmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SGBMV"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DGBMV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CGBMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGBMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HGBMV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xgbmv.cpp b/test/correctness/routines/level2/xgbmv.cpp new file mode 100644 index 00000000..8c49bc65 --- /dev/null +++ b/test/correctness/routines/level2/xgbmv.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xgbmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SGBMV"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DGBMV"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CGBMV"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGBMV"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HGBMV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xgemv.cc b/test/correctness/routines/level2/xgemv.cc deleted file mode 100644 index 902ae777..00000000 --- a/test/correctness/routines/level2/xgemv.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xgemv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SGEMV"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DGEMV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CGEMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGEMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HGEMV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xgemv.cpp b/test/correctness/routines/level2/xgemv.cpp new file mode 100644 index 00000000..902ae777 --- /dev/null +++ b/test/correctness/routines/level2/xgemv.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xgemv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SGEMV"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DGEMV"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CGEMV"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGEMV"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HGEMV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xger.cc b/test/correctness/routines/level2/xger.cc deleted file mode 100644 index ce61bbcb..00000000 --- a/test/correctness/routines/level2/xger.cc +++ /dev/null @@ -1,28 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xger.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SGER"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DGER"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HGER"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xger.cpp b/test/correctness/routines/level2/xger.cpp new file mode 100644 index 00000000..ce61bbcb --- /dev/null +++ b/test/correctness/routines/level2/xger.cpp @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xger.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SGER"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DGER"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HGER"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xgerc.cc b/test/correctness/routines/level2/xgerc.cc deleted file mode 100644 index b747f20d..00000000 --- a/test/correctness/routines/level2/xgerc.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xgerc.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CGERC"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGERC"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xgerc.cpp b/test/correctness/routines/level2/xgerc.cpp new file mode 100644 index 00000000..b747f20d --- /dev/null +++ b/test/correctness/routines/level2/xgerc.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xgerc.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float2>(argc, argv, false, "CGERC"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGERC"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xgeru.cc b/test/correctness/routines/level2/xgeru.cc deleted file mode 100644 index f80c1e2b..00000000 --- a/test/correctness/routines/level2/xgeru.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xgeru.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CGERU"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGERU"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xgeru.cpp b/test/correctness/routines/level2/xgeru.cpp new file mode 100644 index 00000000..f80c1e2b --- /dev/null +++ b/test/correctness/routines/level2/xgeru.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xgeru.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float2>(argc, argv, false, "CGERU"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGERU"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhbmv.cc b/test/correctness/routines/level2/xhbmv.cc deleted file mode 100644 index a4885c01..00000000 --- a/test/correctness/routines/level2/xhbmv.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xhbmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHBMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHBMV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xhbmv.cpp b/test/correctness/routines/level2/xhbmv.cpp new file mode 100644 index 00000000..a4885c01 --- /dev/null +++ b/test/correctness/routines/level2/xhbmv.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xhbmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHBMV"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHBMV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhemv.cc b/test/correctness/routines/level2/xhemv.cc deleted file mode 100644 index 4318ffee..00000000 --- a/test/correctness/routines/level2/xhemv.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xhemv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHEMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHEMV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xhemv.cpp b/test/correctness/routines/level2/xhemv.cpp new file mode 100644 index 00000000..4318ffee --- /dev/null +++ b/test/correctness/routines/level2/xhemv.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xhemv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHEMV"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHEMV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xher.cc b/test/correctness/routines/level2/xher.cc deleted file mode 100644 index fe37bd76..00000000 --- a/test/correctness/routines/level2/xher.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xher.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float>(argc, argv, false, "CHER"); - errors += clblast::RunTests, double2, double>(argc, argv, true, "ZHER"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xher.cpp b/test/correctness/routines/level2/xher.cpp new file mode 100644 index 00000000..fe37bd76 --- /dev/null +++ b/test/correctness/routines/level2/xher.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xher.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float>(argc, argv, false, "CHER"); + errors += clblast::RunTests, double2, double>(argc, argv, true, "ZHER"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xher2.cc b/test/correctness/routines/level2/xher2.cc deleted file mode 100644 index 0b4af4d0..00000000 --- a/test/correctness/routines/level2/xher2.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xher2.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHER2"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHER2"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xher2.cpp b/test/correctness/routines/level2/xher2.cpp new file mode 100644 index 00000000..0b4af4d0 --- /dev/null +++ b/test/correctness/routines/level2/xher2.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xher2.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHER2"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHER2"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhpmv.cc b/test/correctness/routines/level2/xhpmv.cc deleted file mode 100644 index dd77df71..00000000 --- a/test/correctness/routines/level2/xhpmv.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xhpmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHPMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHPMV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xhpmv.cpp b/test/correctness/routines/level2/xhpmv.cpp new file mode 100644 index 00000000..dd77df71 --- /dev/null +++ b/test/correctness/routines/level2/xhpmv.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xhpmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHPMV"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHPMV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhpr.cc b/test/correctness/routines/level2/xhpr.cc deleted file mode 100644 index 5a3f615f..00000000 --- a/test/correctness/routines/level2/xhpr.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xhpr.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float>(argc, argv, false, "CHPR"); - errors += clblast::RunTests, double2, double>(argc, argv, true, "ZHPR"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xhpr.cpp b/test/correctness/routines/level2/xhpr.cpp new file mode 100644 index 00000000..5a3f615f --- /dev/null +++ b/test/correctness/routines/level2/xhpr.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xhpr.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float>(argc, argv, false, "CHPR"); + errors += clblast::RunTests, double2, double>(argc, argv, true, "ZHPR"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhpr2.cc b/test/correctness/routines/level2/xhpr2.cc deleted file mode 100644 index 8218b444..00000000 --- a/test/correctness/routines/level2/xhpr2.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xhpr2.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHPR2"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHPR2"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xhpr2.cpp b/test/correctness/routines/level2/xhpr2.cpp new file mode 100644 index 00000000..8218b444 --- /dev/null +++ b/test/correctness/routines/level2/xhpr2.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xhpr2.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHPR2"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHPR2"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xsbmv.cc b/test/correctness/routines/level2/xsbmv.cc deleted file mode 100644 index 7918cb21..00000000 --- a/test/correctness/routines/level2/xsbmv.cc +++ /dev/null @@ -1,28 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xsbmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SSBMV"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DSBMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSBMV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xsbmv.cpp b/test/correctness/routines/level2/xsbmv.cpp new file mode 100644 index 00000000..7918cb21 --- /dev/null +++ b/test/correctness/routines/level2/xsbmv.cpp @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xsbmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SSBMV"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DSBMV"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HSBMV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xspmv.cc b/test/correctness/routines/level2/xspmv.cc deleted file mode 100644 index 78210660..00000000 --- a/test/correctness/routines/level2/xspmv.cc +++ /dev/null @@ -1,28 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xspmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SSPMV"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DSPMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSPMV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xspmv.cpp b/test/correctness/routines/level2/xspmv.cpp new file mode 100644 index 00000000..78210660 --- /dev/null +++ b/test/correctness/routines/level2/xspmv.cpp @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xspmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SSPMV"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DSPMV"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HSPMV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xspr.cc b/test/correctness/routines/level2/xspr.cc deleted file mode 100644 index d05adf34..00000000 --- a/test/correctness/routines/level2/xspr.cc +++ /dev/null @@ -1,28 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xspr.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SSPR"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DSPR"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSPR"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xspr.cpp b/test/correctness/routines/level2/xspr.cpp new file mode 100644 index 00000000..d05adf34 --- /dev/null +++ b/test/correctness/routines/level2/xspr.cpp @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xspr.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SSPR"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DSPR"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HSPR"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xspr2.cc b/test/correctness/routines/level2/xspr2.cc deleted file mode 100644 index caa46a09..00000000 --- a/test/correctness/routines/level2/xspr2.cc +++ /dev/null @@ -1,28 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xspr2.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SSPR2"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DSPR2"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSPR2"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xspr2.cpp b/test/correctness/routines/level2/xspr2.cpp new file mode 100644 index 00000000..caa46a09 --- /dev/null +++ b/test/correctness/routines/level2/xspr2.cpp @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xspr2.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SSPR2"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DSPR2"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HSPR2"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xsymv.cc b/test/correctness/routines/level2/xsymv.cc deleted file mode 100644 index 978a5f8a..00000000 --- a/test/correctness/routines/level2/xsymv.cc +++ /dev/null @@ -1,28 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xsymv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SSYMV"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DSYMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSYMV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xsymv.cpp b/test/correctness/routines/level2/xsymv.cpp new file mode 100644 index 00000000..978a5f8a --- /dev/null +++ b/test/correctness/routines/level2/xsymv.cpp @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xsymv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SSYMV"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DSYMV"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HSYMV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xsyr.cc b/test/correctness/routines/level2/xsyr.cc deleted file mode 100644 index 244dbfb4..00000000 --- a/test/correctness/routines/level2/xsyr.cc +++ /dev/null @@ -1,28 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xsyr.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SSYR"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DSYR"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSYR"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xsyr.cpp b/test/correctness/routines/level2/xsyr.cpp new file mode 100644 index 00000000..244dbfb4 --- /dev/null +++ b/test/correctness/routines/level2/xsyr.cpp @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xsyr.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SSYR"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DSYR"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HSYR"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xsyr2.cc b/test/correctness/routines/level2/xsyr2.cc deleted file mode 100644 index 422e67ad..00000000 --- a/test/correctness/routines/level2/xsyr2.cc +++ /dev/null @@ -1,28 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xsyr2.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SSYR2"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DSYR2"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSYR2"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xsyr2.cpp b/test/correctness/routines/level2/xsyr2.cpp new file mode 100644 index 00000000..422e67ad --- /dev/null +++ b/test/correctness/routines/level2/xsyr2.cpp @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xsyr2.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SSYR2"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DSYR2"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HSYR2"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtbmv.cc b/test/correctness/routines/level2/xtbmv.cc deleted file mode 100644 index 491708ec..00000000 --- a/test/correctness/routines/level2/xtbmv.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xtbmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "STBMV"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DTBMV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTBMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTBMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HTBMV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xtbmv.cpp b/test/correctness/routines/level2/xtbmv.cpp new file mode 100644 index 00000000..491708ec --- /dev/null +++ b/test/correctness/routines/level2/xtbmv.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xtbmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "STBMV"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DTBMV"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTBMV"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTBMV"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HTBMV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtbsv.cc b/test/correctness/routines/level2/xtbsv.cc deleted file mode 100644 index 12b5dca5..00000000 --- a/test/correctness/routines/level2/xtbsv.cc +++ /dev/null @@ -1,29 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xtbsv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "STBSV"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DTBSV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTBSV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTBSV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xtbsv.cpp b/test/correctness/routines/level2/xtbsv.cpp new file mode 100644 index 00000000..12b5dca5 --- /dev/null +++ b/test/correctness/routines/level2/xtbsv.cpp @@ -0,0 +1,29 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xtbsv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "STBSV"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DTBSV"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTBSV"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTBSV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtpmv.cc b/test/correctness/routines/level2/xtpmv.cc deleted file mode 100644 index b89f0adc..00000000 --- a/test/correctness/routines/level2/xtpmv.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xtpmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "STPMV"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DTPMV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTPMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTPMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HTPMV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xtpmv.cpp b/test/correctness/routines/level2/xtpmv.cpp new file mode 100644 index 00000000..b89f0adc --- /dev/null +++ b/test/correctness/routines/level2/xtpmv.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xtpmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "STPMV"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DTPMV"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTPMV"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTPMV"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HTPMV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtpsv.cc b/test/correctness/routines/level2/xtpsv.cc deleted file mode 100644 index 6e6e7c85..00000000 --- a/test/correctness/routines/level2/xtpsv.cc +++ /dev/null @@ -1,29 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xtpsv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "STPSV"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DTPSV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTPSV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTPSV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xtpsv.cpp b/test/correctness/routines/level2/xtpsv.cpp new file mode 100644 index 00000000..6e6e7c85 --- /dev/null +++ b/test/correctness/routines/level2/xtpsv.cpp @@ -0,0 +1,29 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xtpsv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "STPSV"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DTPSV"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTPSV"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTPSV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtrmv.cc b/test/correctness/routines/level2/xtrmv.cc deleted file mode 100644 index 819f5cad..00000000 --- a/test/correctness/routines/level2/xtrmv.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xtrmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "STRMV"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DTRMV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTRMV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTRMV"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HTRMV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xtrmv.cpp b/test/correctness/routines/level2/xtrmv.cpp new file mode 100644 index 00000000..819f5cad --- /dev/null +++ b/test/correctness/routines/level2/xtrmv.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xtrmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "STRMV"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DTRMV"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTRMV"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTRMV"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HTRMV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtrsv.cc b/test/correctness/routines/level2/xtrsv.cc deleted file mode 100644 index 78e33807..00000000 --- a/test/correctness/routines/level2/xtrsv.cc +++ /dev/null @@ -1,29 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level2/xtrsv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "STRSV"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DTRSV"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTRSV"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTRSV"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level2/xtrsv.cpp b/test/correctness/routines/level2/xtrsv.cpp new file mode 100644 index 00000000..78e33807 --- /dev/null +++ b/test/correctness/routines/level2/xtrsv.cpp @@ -0,0 +1,29 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level2/xtrsv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "STRSV"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DTRSV"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTRSV"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTRSV"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xgemm.cc b/test/correctness/routines/level3/xgemm.cc deleted file mode 100644 index 54d41719..00000000 --- a/test/correctness/routines/level3/xgemm.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level3/xgemm.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SGEMM"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DGEMM"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CGEMM"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGEMM"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HGEMM"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level3/xgemm.cpp b/test/correctness/routines/level3/xgemm.cpp new file mode 100644 index 00000000..54d41719 --- /dev/null +++ b/test/correctness/routines/level3/xgemm.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level3/xgemm.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SGEMM"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DGEMM"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CGEMM"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZGEMM"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HGEMM"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xhemm.cc b/test/correctness/routines/level3/xhemm.cc deleted file mode 100644 index 76c970a7..00000000 --- a/test/correctness/routines/level3/xhemm.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level3/xhemm.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHEMM"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHEMM"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level3/xhemm.cpp b/test/correctness/routines/level3/xhemm.cpp new file mode 100644 index 00000000..76c970a7 --- /dev/null +++ b/test/correctness/routines/level3/xhemm.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level3/xhemm.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float2>(argc, argv, false, "CHEMM"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZHEMM"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xher2k.cc b/test/correctness/routines/level3/xher2k.cc deleted file mode 100644 index c653265e..00000000 --- a/test/correctness/routines/level3/xher2k.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level3/xher2k.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float>(argc, argv, false, "CHER2K"); - errors += clblast::RunTests, double2, double>(argc, argv, true, "ZHER2K"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level3/xher2k.cpp b/test/correctness/routines/level3/xher2k.cpp new file mode 100644 index 00000000..c653265e --- /dev/null +++ b/test/correctness/routines/level3/xher2k.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level3/xher2k.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float>(argc, argv, false, "CHER2K"); + errors += clblast::RunTests, double2, double>(argc, argv, true, "ZHER2K"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xherk.cc b/test/correctness/routines/level3/xherk.cc deleted file mode 100644 index 09ea9e4d..00000000 --- a/test/correctness/routines/level3/xherk.cc +++ /dev/null @@ -1,27 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level3/xherk.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float2, float>(argc, argv, false, "CHERK"); - errors += clblast::RunTests, double2, double>(argc, argv, true, "ZHERK"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level3/xherk.cpp b/test/correctness/routines/level3/xherk.cpp new file mode 100644 index 00000000..09ea9e4d --- /dev/null +++ b/test/correctness/routines/level3/xherk.cpp @@ -0,0 +1,27 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level3/xherk.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float2, float>(argc, argv, false, "CHERK"); + errors += clblast::RunTests, double2, double>(argc, argv, true, "ZHERK"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xsymm.cc b/test/correctness/routines/level3/xsymm.cc deleted file mode 100644 index 3cb3515a..00000000 --- a/test/correctness/routines/level3/xsymm.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level3/xsymm.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SSYMM"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DSYMM"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSYMM"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSYMM"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSYMM"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level3/xsymm.cpp b/test/correctness/routines/level3/xsymm.cpp new file mode 100644 index 00000000..3cb3515a --- /dev/null +++ b/test/correctness/routines/level3/xsymm.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level3/xsymm.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SSYMM"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DSYMM"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSYMM"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSYMM"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HSYMM"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xsyr2k.cc b/test/correctness/routines/level3/xsyr2k.cc deleted file mode 100644 index 617af04d..00000000 --- a/test/correctness/routines/level3/xsyr2k.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level3/xsyr2k.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SSYR2K"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DSYR2K"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSYR2K"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSYR2K"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSYR2K"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level3/xsyr2k.cpp b/test/correctness/routines/level3/xsyr2k.cpp new file mode 100644 index 00000000..617af04d --- /dev/null +++ b/test/correctness/routines/level3/xsyr2k.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level3/xsyr2k.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SSYR2K"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DSYR2K"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSYR2K"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSYR2K"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HSYR2K"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xsyrk.cc b/test/correctness/routines/level3/xsyrk.cc deleted file mode 100644 index 2014b8d0..00000000 --- a/test/correctness/routines/level3/xsyrk.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level3/xsyrk.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SSYRK"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DSYRK"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSYRK"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSYRK"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HSYRK"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level3/xsyrk.cpp b/test/correctness/routines/level3/xsyrk.cpp new file mode 100644 index 00000000..2014b8d0 --- /dev/null +++ b/test/correctness/routines/level3/xsyrk.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level3/xsyrk.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SSYRK"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DSYRK"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CSYRK"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZSYRK"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HSYRK"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xtrmm.cc b/test/correctness/routines/level3/xtrmm.cc deleted file mode 100644 index 32640d52..00000000 --- a/test/correctness/routines/level3/xtrmm.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level3/xtrmm.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "STRMM"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DTRMM"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTRMM"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTRMM"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HTRMM"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level3/xtrmm.cpp b/test/correctness/routines/level3/xtrmm.cpp new file mode 100644 index 00000000..32640d52 --- /dev/null +++ b/test/correctness/routines/level3/xtrmm.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level3/xtrmm.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "STRMM"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DTRMM"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTRMM"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTRMM"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HTRMM"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xtrsm.cc b/test/correctness/routines/level3/xtrsm.cc deleted file mode 100644 index 6119bd17..00000000 --- a/test/correctness/routines/level3/xtrsm.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/level3/xtrsm.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "STRSM"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DTRSM"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTRSM"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTRSM"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HTRSM"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/level3/xtrsm.cpp b/test/correctness/routines/level3/xtrsm.cpp new file mode 100644 index 00000000..6119bd17 --- /dev/null +++ b/test/correctness/routines/level3/xtrsm.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/level3/xtrsm.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "STRSM"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DTRSM"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTRSM"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTRSM"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HTRSM"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/routines/levelx/xomatcopy.cc b/test/correctness/routines/levelx/xomatcopy.cc deleted file mode 100644 index e034bc18..00000000 --- a/test/correctness/routines/levelx/xomatcopy.cc +++ /dev/null @@ -1,30 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/correctness/testblas.hpp" -#include "test/routines/levelx/xomatcopy.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::RunTests, float, float>(argc, argv, false, "SOMATCOPY"); - errors += clblast::RunTests, double, double>(argc, argv, true, "DOMATCOPY"); - errors += clblast::RunTests, float2, float2>(argc, argv, true, "COMATCOPY"); - errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZOMATCOPY"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HOMATCOPY"); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= diff --git a/test/correctness/routines/levelx/xomatcopy.cpp b/test/correctness/routines/levelx/xomatcopy.cpp new file mode 100644 index 00000000..e034bc18 --- /dev/null +++ b/test/correctness/routines/levelx/xomatcopy.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/levelx/xomatcopy.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SOMATCOPY"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DOMATCOPY"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "COMATCOPY"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZOMATCOPY"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HOMATCOPY"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc deleted file mode 100644 index cec8bafa..00000000 --- a/test/correctness/testblas.cc +++ /dev/null @@ -1,244 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the TestBlas class (see the header for information about the class). 
-// -// ================================================================================================= - -#include -#include - -#include "test/correctness/testblas.hpp" - -namespace clblast { -// ================================================================================================= - -// The transpose-options to test with (data-type dependent) -template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes}; -template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes}; -template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes}; -template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate}; -template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate}; -template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kConjugate}; -template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kConjugate}; - -// ================================================================================================= - -// Constructor, initializes the base class tester and input data -template -TestBlas::TestBlas(int argc, char *argv[], const bool silent, - const std::string &name, const std::vector &options, - const Routine run_routine, - const Routine run_reference1, const Routine run_reference2, - const ResultGet get_result, const ResultIndex get_index, - const ResultIterator get_id1, const ResultIterator get_id2): - Tester(argc, argv, silent, name, options), - run_routine_(run_routine), - get_result_(get_result), - get_index_(get_index), - get_id1_(get_id1), - get_id2_(get_id2) { - - // Sets the reference to test against - if (compare_clblas_) { run_reference_ = run_reference1; } - else if (compare_cblas_) { run_reference_ = run_reference2; } - else { throw std::runtime_error("Invalid configuration: no reference to test against"); } - - // 
Computes the maximum sizes. This allows for a single set of input/output buffers. - auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end()); - auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end()); - auto max_mat = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); - auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); - auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end()); - auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end()); - - // Creates test input data - x_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset); - y_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset); - a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); - b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); - c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); - ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset); - scalar_source_.resize(std::max(max_mat, max_matvec) + max_offset); - PopulateVector(x_source_); - PopulateVector(y_source_); - PopulateVector(a_source_); - PopulateVector(b_source_); - PopulateVector(c_source_); - PopulateVector(ap_source_); - PopulateVector(scalar_source_); -} - -// =============================================================================================== - -// Tests the routine for a wide variety of parameters -template -void TestBlas::TestRegular(std::vector> &test_vector, const std::string &name) { - if (!PrecisionSupported(device_)) { return; } - TestStart("regular behaviour", name); - - // Iterates over all the to-be-tested combinations of arguments - for (auto &args: test_vector) { - - // Prints the current test configuration - if (verbose_) { - fprintf(stdout, " Config: %s-> ", GetOptionsString(args).c_str()); - } - - // Runs the CLBlast code - auto x_vec2 
= Buffer(context_, args.x_size); - auto y_vec2 = Buffer(context_, args.y_size); - auto a_mat2 = Buffer(context_, args.a_size); - auto b_mat2 = Buffer(context_, args.b_size); - auto c_mat2 = Buffer(context_, args.c_size); - auto ap_mat2 = Buffer(context_, args.ap_size); - auto scalar2 = Buffer(context_, args.scalar_size); - x_vec2.Write(queue_, args.x_size, x_source_); - y_vec2.Write(queue_, args.y_size, y_source_); - a_mat2.Write(queue_, args.a_size, a_source_); - b_mat2.Write(queue_, args.b_size, b_source_); - c_mat2.Write(queue_, args.c_size, c_source_); - ap_mat2.Write(queue_, args.ap_size, ap_source_); - scalar2.Write(queue_, args.scalar_size, scalar_source_); - auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; - auto status2 = run_routine_(args, buffers2, queue_); - - // Don't continue with CBLAS if there are incorrect parameters - if (compare_cblas_ && status2 != StatusCode::kSuccess) { - TestErrorCodes(status2, status2, args); - continue; - } - - // Runs the reference BLAS code - auto x_vec1 = Buffer(context_, args.x_size); - auto y_vec1 = Buffer(context_, args.y_size); - auto a_mat1 = Buffer(context_, args.a_size); - auto b_mat1 = Buffer(context_, args.b_size); - auto c_mat1 = Buffer(context_, args.c_size); - auto ap_mat1 = Buffer(context_, args.ap_size); - auto scalar1 = Buffer(context_, args.scalar_size); - x_vec1.Write(queue_, args.x_size, x_source_); - y_vec1.Write(queue_, args.y_size, y_source_); - a_mat1.Write(queue_, args.a_size, a_source_); - b_mat1.Write(queue_, args.b_size, b_source_); - c_mat1.Write(queue_, args.c_size, c_source_); - ap_mat1.Write(queue_, args.ap_size, ap_source_); - scalar1.Write(queue_, args.scalar_size, scalar_source_); - auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; - auto status1 = run_reference_(args, buffers1, queue_); - - // Tests for equality of the two status codes - if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) { - 
TestErrorCodes(status1, status2, args); - continue; - } - - // Downloads the results - auto result1 = get_result_(args, buffers1, queue_); - auto result2 = get_result_(args, buffers2, queue_); - - // Checks for differences in the output - auto errors = size_t{0}; - for (auto id1=size_t{0}; id1 0) { fprintf(stdout, "\n "); } - - // Tests the error count (should be zero) - TestErrorCount(errors, get_id1_(args)*get_id2_(args), args); - } - TestEnd(); -} - -// ================================================================================================= - -// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types, -// does not test for results (if any). -template -void TestBlas::TestInvalid(std::vector> &test_vector, const std::string &name) { - if (!PrecisionSupported(device_)) { return; } - if (!compare_clblas_) { return; } // not supported for CPU BLAS routines - if (std::is_same::value) { return; } // not supported for half-precision - TestStart("invalid buffer sizes", name); - - // Iterates over all the to-be-tested combinations of arguments - for (auto &args: test_vector) { - - // Prints the current test configuration - if (verbose_) { - fprintf(stdout, " Config: %s-> ", GetSizesString(args).c_str()); - } - - // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly - // want to be able to create invalid buffers (no error checking here). 
- auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); - auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); - auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); - auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); - auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); - auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); - auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); - auto x_vec1 = Buffer(x1); - auto y_vec1 = Buffer(y1); - auto a_mat1 = Buffer(a1); - auto b_mat1 = Buffer(b1); - auto c_mat1 = Buffer(c1); - auto ap_mat1 = Buffer(ap1); - auto scalar1 = Buffer(d1); - auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); - auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); - auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); - auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); - auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); - auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); - auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); - auto x_vec2 = Buffer(x2); - auto y_vec2 = Buffer(y2); - auto a_mat2 = Buffer(a2); - auto b_mat2 = Buffer(b2); - auto c_mat2 = Buffer(c2); - auto ap_mat2 = Buffer(ap2); - auto scalar2 = Buffer(d2); - - // Runs the two routines - auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; - auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; - 
auto status1 = run_reference_(args, buffers1, queue_); - auto status2 = run_routine_(args, buffers2, queue_); - - // Tests for equality of the two status codes - TestErrorCodes(status1, status2, args); - } - TestEnd(); -} - -// ================================================================================================= - -// Compiles the templated class -template class TestBlas; -template class TestBlas; -template class TestBlas; -template class TestBlas; -template class TestBlas; -template class TestBlas; -template class TestBlas; - -// ================================================================================================= -} // namespace clblast diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp new file mode 100644 index 00000000..cec8bafa --- /dev/null +++ b/test/correctness/testblas.cpp @@ -0,0 +1,244 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the TestBlas class (see the header for information about the class). 
+// +// ================================================================================================= + +#include +#include + +#include "test/correctness/testblas.hpp" + +namespace clblast { +// ================================================================================================= + +// The transpose-options to test with (data-type dependent) +template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes}; +template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes}; +template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes}; +template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate}; +template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate}; +template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kConjugate}; +template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kConjugate}; + +// ================================================================================================= + +// Constructor, initializes the base class tester and input data +template +TestBlas::TestBlas(int argc, char *argv[], const bool silent, + const std::string &name, const std::vector &options, + const Routine run_routine, + const Routine run_reference1, const Routine run_reference2, + const ResultGet get_result, const ResultIndex get_index, + const ResultIterator get_id1, const ResultIterator get_id2): + Tester(argc, argv, silent, name, options), + run_routine_(run_routine), + get_result_(get_result), + get_index_(get_index), + get_id1_(get_id1), + get_id2_(get_id2) { + + // Sets the reference to test against + if (compare_clblas_) { run_reference_ = run_reference1; } + else if (compare_cblas_) { run_reference_ = run_reference2; } + else { throw std::runtime_error("Invalid configuration: no reference to test against"); } + + // 
Computes the maximum sizes. This allows for a single set of input/output buffers. + auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end()); + auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end()); + auto max_mat = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); + auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); + auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end()); + auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end()); + + // Creates test input data + x_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset); + y_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset); + a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset); + scalar_source_.resize(std::max(max_mat, max_matvec) + max_offset); + PopulateVector(x_source_); + PopulateVector(y_source_); + PopulateVector(a_source_); + PopulateVector(b_source_); + PopulateVector(c_source_); + PopulateVector(ap_source_); + PopulateVector(scalar_source_); +} + +// =============================================================================================== + +// Tests the routine for a wide variety of parameters +template +void TestBlas::TestRegular(std::vector> &test_vector, const std::string &name) { + if (!PrecisionSupported(device_)) { return; } + TestStart("regular behaviour", name); + + // Iterates over all the to-be-tested combinations of arguments + for (auto &args: test_vector) { + + // Prints the current test configuration + if (verbose_) { + fprintf(stdout, " Config: %s-> ", GetOptionsString(args).c_str()); + } + + // Runs the CLBlast code + auto x_vec2 
= Buffer(context_, args.x_size); + auto y_vec2 = Buffer(context_, args.y_size); + auto a_mat2 = Buffer(context_, args.a_size); + auto b_mat2 = Buffer(context_, args.b_size); + auto c_mat2 = Buffer(context_, args.c_size); + auto ap_mat2 = Buffer(context_, args.ap_size); + auto scalar2 = Buffer(context_, args.scalar_size); + x_vec2.Write(queue_, args.x_size, x_source_); + y_vec2.Write(queue_, args.y_size, y_source_); + a_mat2.Write(queue_, args.a_size, a_source_); + b_mat2.Write(queue_, args.b_size, b_source_); + c_mat2.Write(queue_, args.c_size, c_source_); + ap_mat2.Write(queue_, args.ap_size, ap_source_); + scalar2.Write(queue_, args.scalar_size, scalar_source_); + auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; + auto status2 = run_routine_(args, buffers2, queue_); + + // Don't continue with CBLAS if there are incorrect parameters + if (compare_cblas_ && status2 != StatusCode::kSuccess) { + TestErrorCodes(status2, status2, args); + continue; + } + + // Runs the reference BLAS code + auto x_vec1 = Buffer(context_, args.x_size); + auto y_vec1 = Buffer(context_, args.y_size); + auto a_mat1 = Buffer(context_, args.a_size); + auto b_mat1 = Buffer(context_, args.b_size); + auto c_mat1 = Buffer(context_, args.c_size); + auto ap_mat1 = Buffer(context_, args.ap_size); + auto scalar1 = Buffer(context_, args.scalar_size); + x_vec1.Write(queue_, args.x_size, x_source_); + y_vec1.Write(queue_, args.y_size, y_source_); + a_mat1.Write(queue_, args.a_size, a_source_); + b_mat1.Write(queue_, args.b_size, b_source_); + c_mat1.Write(queue_, args.c_size, c_source_); + ap_mat1.Write(queue_, args.ap_size, ap_source_); + scalar1.Write(queue_, args.scalar_size, scalar_source_); + auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; + auto status1 = run_reference_(args, buffers1, queue_); + + // Tests for equality of the two status codes + if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) { + 
TestErrorCodes(status1, status2, args); + continue; + } + + // Downloads the results + auto result1 = get_result_(args, buffers1, queue_); + auto result2 = get_result_(args, buffers2, queue_); + + // Checks for differences in the output + auto errors = size_t{0}; + for (auto id1=size_t{0}; id1 0) { fprintf(stdout, "\n "); } + + // Tests the error count (should be zero) + TestErrorCount(errors, get_id1_(args)*get_id2_(args), args); + } + TestEnd(); +} + +// ================================================================================================= + +// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types, +// does not test for results (if any). +template +void TestBlas::TestInvalid(std::vector> &test_vector, const std::string &name) { + if (!PrecisionSupported(device_)) { return; } + if (!compare_clblas_) { return; } // not supported for CPU BLAS routines + if (std::is_same::value) { return; } // not supported for half-precision + TestStart("invalid buffer sizes", name); + + // Iterates over all the to-be-tested combinations of arguments + for (auto &args: test_vector) { + + // Prints the current test configuration + if (verbose_) { + fprintf(stdout, " Config: %s-> ", GetSizesString(args).c_str()); + } + + // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly + // want to be able to create invalid buffers (no error checking here). 
+ auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); + auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); + auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); + auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); + auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); + auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); + auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); + auto x_vec1 = Buffer(x1); + auto y_vec1 = Buffer(y1); + auto a_mat1 = Buffer(a1); + auto b_mat1 = Buffer(b1); + auto c_mat1 = Buffer(c1); + auto ap_mat1 = Buffer(ap1); + auto scalar1 = Buffer(d1); + auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); + auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); + auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); + auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); + auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); + auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); + auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); + auto x_vec2 = Buffer(x2); + auto y_vec2 = Buffer(y2); + auto a_mat2 = Buffer(a2); + auto b_mat2 = Buffer(b2); + auto c_mat2 = Buffer(c2); + auto ap_mat2 = Buffer(ap2); + auto scalar2 = Buffer(d2); + + // Runs the two routines + auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; + auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; + 
auto status1 = run_reference_(args, buffers1, queue_); + auto status2 = run_routine_(args, buffers2, queue_); + + // Tests for equality of the two status codes + TestErrorCodes(status1, status2, args); + } + TestEnd(); +} + +// ================================================================================================= + +// Compiles the templated class +template class TestBlas; +template class TestBlas; +template class TestBlas; +template class TestBlas; +template class TestBlas; +template class TestBlas; +template class TestBlas; + +// ================================================================================================= +} // namespace clblast diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc deleted file mode 100644 index 92e2c1b8..00000000 --- a/test/correctness/tester.cc +++ /dev/null @@ -1,441 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file implements the Tester class (see the header for information about the class). -// -// ================================================================================================= - -#include -#include -#include -#include - -#include "test/correctness/tester.hpp" - -namespace clblast { -// ================================================================================================= - -// General constructor for all CLBlast testers. It prints out the test header to stdout and sets-up -// the clBLAS library for reference. 
-template -Tester::Tester(int argc, char *argv[], const bool silent, - const std::string &name, const std::vector &options): - help_("Options given/available:\n"), - platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))), - device_(Device(platform_, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))), - context_(Context(device_)), - queue_(Queue(context_, device_)), - full_test_(CheckArgument(argc, argv, help_, kArgFullTest)), - verbose_(CheckArgument(argc, argv, help_, kArgVerbose)), - error_log_{}, - num_passed_{0}, - num_skipped_{0}, - num_failed_{0}, - print_count_{0}, - tests_passed_{0}, - tests_skipped_{0}, - tests_failed_{0}, - options_{options} { - - // Determines which reference to test against - #if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS) - compare_clblas_ = GetArgument(argc, argv, help_, kArgCompareclblas, 0); - compare_cblas_ = GetArgument(argc, argv, help_, kArgComparecblas, 1); - #elif CLBLAST_REF_CLBLAS - compare_clblas_ = GetArgument(argc, argv, help_, kArgCompareclblas, 1); - compare_cblas_ = 0; - #elif CLBLAST_REF_CBLAS - compare_clblas_ = 0; - compare_cblas_ = GetArgument(argc, argv, help_, kArgComparecblas, 1); - #else - compare_clblas_ = 0; - compare_cblas_ = 0; - #endif - - // Prints the help message (command-line arguments) - if (!silent) { fprintf(stdout, "\n* %s\n", help_.c_str()); } - - // Can only test against a single reference (not two, not zero) - if (compare_clblas_ && compare_cblas_) { - throw std::runtime_error("Cannot test against both clBLAS and CBLAS references; choose one using the -cblas and -clblas arguments"); - } - if (!compare_clblas_ && !compare_cblas_) { - throw std::runtime_error("Choose one reference (clBLAS or CBLAS) to test against using the -cblas and -clblas arguments"); - } - - // Prints the header - fprintf(stdout, "* Running on OpenCL device '%s'.\n", device_.Name().c_str()); - fprintf(stdout, "* Starting tests for the %s'%s'%s routine.", - kPrintMessage.c_str(), 
name.c_str(), kPrintEnd.c_str()); - - // Checks whether the precision is supported - if (!PrecisionSupported(device_)) { - fprintf(stdout, "\n* All tests skipped: %sUnsupported precision%s\n", - kPrintWarning.c_str(), kPrintEnd.c_str()); - return; - } - - // Prints the legend - fprintf(stdout, " Legend:\n"); - fprintf(stdout, " %s -> Test produced correct results\n", kSuccessData.c_str()); - fprintf(stdout, " %s -> Test returned the correct error code\n", kSuccessStatus.c_str()); - fprintf(stdout, " %s -> Test produced incorrect results\n", kErrorData.c_str()); - fprintf(stdout, " %s -> Test returned an incorrect error code\n", kErrorStatus.c_str()); - fprintf(stdout, " %s -> Test not executed: OpenCL-kernel compilation error\n", - kSkippedCompilation.c_str()); - fprintf(stdout, " %s -> Test not executed: Unsupported precision\n", - kUnsupportedPrecision.c_str()); - fprintf(stdout, " %s -> Test not completed: Reference CBLAS doesn't output error codes\n", - kUnsupportedReference.c_str()); - - // Initializes clBLAS - #ifdef CLBLAST_REF_CLBLAS - if (compare_clblas_) { - auto status = clblasSetup(); - if (status != CL_SUCCESS) { - throw std::runtime_error("clBLAS setup error: "+ToString(static_cast(status))); - } - } - #endif -} - -// Destructor prints the summary of the test cases and cleans-up the clBLAS library -template -Tester::~Tester() { - if (PrecisionSupported(device_)) { - fprintf(stdout, "* Completed all test-cases for this routine. 
Results:\n"); - fprintf(stdout, " %zu test(s) passed\n", tests_passed_); - if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); } - fprintf(stdout, " %zu test(s) skipped%s\n", tests_skipped_, kPrintEnd.c_str()); - if (tests_failed_ > 0) { fprintf(stdout, "%s", kPrintError.c_str()); } - fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str()); - } - fprintf(stdout, "\n"); - - // Cleans-up clBLAS - #ifdef CLBLAST_REF_CLBLAS - if (compare_clblas_) { - clblasTeardown(); - } - #endif -} - -// ================================================================================================= - -// Function called at the start of each test. This prints a header with information about the -// test and re-initializes all test data-structures. -template -void Tester::TestStart(const std::string &test_name, const std::string &test_configuration) { - - // Prints the header - fprintf(stdout, "* Testing %s'%s'%s for %s'%s'%s:\n", - kPrintMessage.c_str(), test_name.c_str(), kPrintEnd.c_str(), - kPrintMessage.c_str(), test_configuration.c_str(), kPrintEnd.c_str()); - if (!verbose_) { fprintf(stdout, " "); } - - // Empties the error log and the error/pass counters - error_log_.clear(); - num_passed_ = 0; - num_skipped_ = 0; - num_failed_ = 0; - print_count_ = 0; -} - -// Function called at the end of each test. This prints errors if any occured. It also prints a -// summary of the number of sub-tests passed/failed. 
-template -void Tester::TestEnd() { - if (!verbose_) { fprintf(stdout, "\n"); } - tests_passed_ += num_passed_; - tests_skipped_ += num_skipped_; - tests_failed_ += num_failed_; - - // Prints the errors - PrintErrorLog(error_log_); - - // Prints a test summary - auto pass_rate = 100*num_passed_ / static_cast(num_passed_ + num_skipped_ + num_failed_); - fprintf(stdout, " Pass rate %s%5.1lf%%%s:", kPrintMessage.c_str(), pass_rate, kPrintEnd.c_str()); - fprintf(stdout, " %zu passed /", num_passed_); - if (num_skipped_ != 0) { - fprintf(stdout, " %s%zu skipped%s /", kPrintWarning.c_str(), num_skipped_, kPrintEnd.c_str()); - } - else { - fprintf(stdout, " %zu skipped /", num_skipped_); - } - if (num_failed_ != 0) { - fprintf(stdout, " %s%zu failed%s\n", kPrintError.c_str(), num_failed_, kPrintEnd.c_str()); - } - else { - fprintf(stdout, " %zu failed\n", num_failed_); - } -} - -// ================================================================================================= - -// Handles a 'pass' or 'error' depending on whether there are any errors -template -void Tester::TestErrorCount(const size_t errors, const size_t size, const Arguments &args) { - - // Finished successfully - if (errors == 0) { - PrintTestResult(kSuccessData); - ReportPass(); - } - - // Error(s) occurred - else { - auto percentage = 100*errors / static_cast(size); - PrintTestResult(kErrorData); - ReportError({StatusCode::kSuccess, StatusCode::kSuccess, percentage, args}); - } -} - -// Compares two status codes for equality. The outcome can be a pass (they are the same), a warning -// (CLBlast reported a compilation error), or an error (they are different). 
-template -void Tester::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status, - const Arguments &args) { - - // Cannot compare error codes against a library other than clBLAS - if (compare_cblas_) { - PrintTestResult(kUnsupportedReference); - ReportSkipped(); - } - - // Finished successfully - else if (clblas_status == clblast_status) { - PrintTestResult(kSuccessStatus); - ReportPass(); - } - - // No support for this kind of precision - else if (clblast_status == StatusCode::kNoDoublePrecision || - clblast_status == StatusCode::kNoHalfPrecision) { - PrintTestResult(kUnsupportedPrecision); - ReportSkipped(); - } - - // Could not compile the CLBlast kernel properly - else if (clblast_status == StatusCode::kBuildProgramFailure || - clblast_status == StatusCode::kNotImplemented) { - PrintTestResult(kSkippedCompilation); - ReportSkipped(); - } - - // Error occurred - else { - PrintTestResult(kErrorStatus); - ReportError({clblas_status, clblast_status, kStatusError, args}); - if (verbose_) { - fprintf(stdout, "\n"); - PrintErrorLog({{clblas_status, clblast_status, kStatusError, args}}); - fprintf(stdout, " "); - } - } -} - -// ================================================================================================= - -// Retrieves the offset values to test with -template -const std::vector Tester::GetOffsets() const { - if (full_test_) { return {0, 10}; } - else { return {0}; } -} - -// Retrieves the options as a string for a specific test -template -std::string Tester::GetOptionsString(const Arguments &args) { - auto result = std::string(""); - const auto equals = std::string("="); - for (auto &o: options_) { - if (o == kArgM) { result += kArgM + equals + ToString(args.m) + " "; } - if (o == kArgN) { result += kArgN + equals + ToString(args.n) + " "; } - if (o == kArgK) { result += kArgK + equals + ToString(args.k) + " "; } - if (o == kArgKU) { result += kArgKU + equals + ToString(args.ku) + " "; } - if (o == kArgKL) { result += kArgKL 
+ equals + ToString(args.kl) + " "; } - if (o == kArgXInc) { result += kArgXInc + equals + ToString(args.x_inc) + " "; } - if (o == kArgYInc) { result += kArgYInc + equals + ToString(args.y_inc) + " "; } - if (o == kArgXOffset) { result += kArgXOffset + equals + ToString(args.x_offset) + " "; } - if (o == kArgYOffset) { result += kArgYOffset + equals + ToString(args.y_offset) + " "; } - if (o == kArgALeadDim) { result += kArgALeadDim + equals + ToString(args.a_ld) + " "; } - if (o == kArgBLeadDim) { result += kArgBLeadDim + equals + ToString(args.b_ld) + " "; } - if (o == kArgCLeadDim) { result += kArgCLeadDim + equals + ToString(args.c_ld) + " "; } - if (o == kArgAOffset) { result += kArgAOffset + equals + ToString(args.a_offset) + " "; } - if (o == kArgBOffset) { result += kArgBOffset + equals + ToString(args.b_offset) + " "; } - if (o == kArgCOffset) { result += kArgCOffset + equals + ToString(args.c_offset) + " "; } - if (o == kArgAPOffset) { result += kArgAPOffset + equals + ToString(args.ap_offset) + " "; } - if (o == kArgDotOffset){ result += kArgDotOffset + equals + ToString(args.dot_offset) + " "; } - } - return result; -} - -// As above, but now only prints information relevant to invalid buffer sizes -template -std::string Tester::GetSizesString(const Arguments &args) { - auto result = std::string(""); - const auto equals = std::string("="); - for (auto &o: options_) { - if (o == kArgM) { result += kArgM + equals + ToString(args.m) + " "; } - if (o == kArgN) { result += kArgN + equals + ToString(args.n) + " "; } - if (o == kArgK) { result += kArgK + equals + ToString(args.k) + " "; } - if (o == kArgXOffset) { result += "xsize" + equals + ToString(args.x_size) + " "; } - if (o == kArgYOffset) { result += "ysize" + equals + ToString(args.y_size) + " "; } - if (o == kArgAOffset) { result += "asize" + equals + ToString(args.a_size) + " "; } - if (o == kArgBOffset) { result += "bsize" + equals + ToString(args.b_size) + " "; } - if (o == kArgCOffset) { result 
+= "csize" + equals + ToString(args.c_size) + " "; } - if (o == kArgAPOffset) { result += "apsize" + equals + ToString(args.ap_size) + " "; } - if (o == kArgDotOffset){ result += "scalarsize" + equals + ToString(args.scalar_size) + " "; } - } - return result; -} - -// ================================================================================================= - -// A test can either pass, be skipped, or fail -template -void Tester::ReportPass() { - num_passed_++; -} -template -void Tester::ReportSkipped() { - num_skipped_++; -} -template -void Tester::ReportError(const ErrorLogEntry &error_log_entry) { - error_log_.push_back(error_log_entry); - num_failed_++; -} - -// ================================================================================================= - -// Prints the test-result symbol to screen. This function limits the maximum number of symbols per -// line by printing newlines once every so many calls. -template -void Tester::PrintTestResult(const std::string &message) { - if (verbose_) { - fprintf(stdout, "%s\n", message.c_str()); - } - else - { - if (print_count_ == kResultsPerLine) { - print_count_ = 0; - fprintf(stdout, "\n "); - } - fprintf(stdout, "%s", message.c_str()); - print_count_++; - } - std::cout << std::flush; -} - -// Prints details of errors occurred in a given error log -template -void Tester::PrintErrorLog(const std::vector &error_log) { - for (auto &entry: error_log) { - if (entry.error_percentage != kStatusError) { - fprintf(stdout, " Error rate %.1lf%%: ", entry.error_percentage); - } - else { - fprintf(stdout, " Status code %d (expected %d): ", entry.status_found, entry.status_expect); - } - fprintf(stdout, "%s\n", GetOptionsString(entry.args).c_str()); - } -} - -// ================================================================================================= -// Below are the non-member functions (separated because of otherwise required partial class -// template specialization) -// 
================================================================================================= - -// Compares two floating point values and returns whether they are within an acceptable error -// margin. This replaces GTest's EXPECT_NEAR(). -template -bool TestSimilarity(const T val1, const T val2) { - const auto difference = std::fabs(val1 - val2); - - // Set the allowed error margin for floating-point comparisons - constexpr auto kErrorMarginRelative = T(0.025); - constexpr auto kErrorMarginAbsolute = T(1.0e-3); - - // Shortcut, handles infinities - if (val1 == val2) { - return true; - } - // The values are zero or very small: the relative error is less meaningful - else if (val1 == 0 || val2 == 0 || difference < kErrorMarginAbsolute) { - return (difference < kErrorMarginAbsolute); - } - // Use relative error - else { - const auto absolute_sum = std::fabs(val1) + std::fabs(val2); - return (difference / absolute_sum) < kErrorMarginRelative; - } -} - -// Compiles the default case for standard data-types -template bool TestSimilarity(const float, const float); -template bool TestSimilarity(const double, const double); - -// Specialisations for non-standard data-types -template <> -bool TestSimilarity(const float2 val1, const float2 val2) { - auto real = TestSimilarity(val1.real(), val2.real()); - auto imag = TestSimilarity(val1.imag(), val2.imag()); - return (real && imag); -} -template <> -bool TestSimilarity(const double2 val1, const double2 val2) { - auto real = TestSimilarity(val1.real(), val2.real()); - auto imag = TestSimilarity(val1.imag(), val2.imag()); - return (real && imag); -} -template <> -bool TestSimilarity(const half val1, const half val2) { - return TestSimilarity(HalfToFloat(val1), HalfToFloat(val2)); -} - -// ================================================================================================= - -// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various -// routines. 
This function is specialised for the different data-types. -template <> const std::vector GetExampleScalars(const bool full_test) { - if (full_test) { return {0.0f, 1.0f, 3.14f}; } - else { return {3.14f}; } -} -template <> const std::vector GetExampleScalars(const bool full_test) { - if (full_test) { return {0.0, 1.0, 3.14}; } - else { return {3.14}; } -} -template <> const std::vector GetExampleScalars(const bool full_test) { - if (full_test) { return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}}; } - else { return {{2.42f, 3.14f}}; } -} -template <> const std::vector GetExampleScalars(const bool full_test) { - if (full_test) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; } - else { return {{2.42, 3.14}}; } -} -template <> const std::vector GetExampleScalars(const bool full_test) { - if (full_test) { return {FloatToHalf(0.0f), FloatToHalf(1.0f), FloatToHalf(3.14f)}; } - else { return {FloatToHalf(3.14f)}; } -} - -// ================================================================================================= - -// Compiles the templated class -template class Tester; -template class Tester; -template class Tester; -template class Tester; -template class Tester; -template class Tester; -template class Tester; - -// ================================================================================================= -} // namespace clblast diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp new file mode 100644 index 00000000..92e2c1b8 --- /dev/null +++ b/test/correctness/tester.cpp @@ -0,0 +1,441 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements the Tester class (see the header for information about the class). +// +// ================================================================================================= + +#include +#include +#include +#include + +#include "test/correctness/tester.hpp" + +namespace clblast { +// ================================================================================================= + +// General constructor for all CLBlast testers. It prints out the test header to stdout and sets-up +// the clBLAS library for reference. +template +Tester::Tester(int argc, char *argv[], const bool silent, + const std::string &name, const std::vector &options): + help_("Options given/available:\n"), + platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))), + device_(Device(platform_, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))), + context_(Context(device_)), + queue_(Queue(context_, device_)), + full_test_(CheckArgument(argc, argv, help_, kArgFullTest)), + verbose_(CheckArgument(argc, argv, help_, kArgVerbose)), + error_log_{}, + num_passed_{0}, + num_skipped_{0}, + num_failed_{0}, + print_count_{0}, + tests_passed_{0}, + tests_skipped_{0}, + tests_failed_{0}, + options_{options} { + + // Determines which reference to test against + #if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS) + compare_clblas_ = GetArgument(argc, argv, help_, kArgCompareclblas, 0); + compare_cblas_ = GetArgument(argc, argv, help_, kArgComparecblas, 1); + #elif CLBLAST_REF_CLBLAS + compare_clblas_ = GetArgument(argc, argv, help_, kArgCompareclblas, 1); + compare_cblas_ = 0; + #elif CLBLAST_REF_CBLAS + compare_clblas_ = 0; + compare_cblas_ = GetArgument(argc, argv, help_, kArgComparecblas, 1); + #else + compare_clblas_ = 0; + compare_cblas_ = 0; + #endif + + // Prints the help message (command-line arguments) + if (!silent) { fprintf(stdout, "\n* %s\n", help_.c_str()); } + + // Can only test against a 
single reference (not two, not zero) + if (compare_clblas_ && compare_cblas_) { + throw std::runtime_error("Cannot test against both clBLAS and CBLAS references; choose one using the -cblas and -clblas arguments"); + } + if (!compare_clblas_ && !compare_cblas_) { + throw std::runtime_error("Choose one reference (clBLAS or CBLAS) to test against using the -cblas and -clblas arguments"); + } + + // Prints the header + fprintf(stdout, "* Running on OpenCL device '%s'.\n", device_.Name().c_str()); + fprintf(stdout, "* Starting tests for the %s'%s'%s routine.", + kPrintMessage.c_str(), name.c_str(), kPrintEnd.c_str()); + + // Checks whether the precision is supported + if (!PrecisionSupported(device_)) { + fprintf(stdout, "\n* All tests skipped: %sUnsupported precision%s\n", + kPrintWarning.c_str(), kPrintEnd.c_str()); + return; + } + + // Prints the legend + fprintf(stdout, " Legend:\n"); + fprintf(stdout, " %s -> Test produced correct results\n", kSuccessData.c_str()); + fprintf(stdout, " %s -> Test returned the correct error code\n", kSuccessStatus.c_str()); + fprintf(stdout, " %s -> Test produced incorrect results\n", kErrorData.c_str()); + fprintf(stdout, " %s -> Test returned an incorrect error code\n", kErrorStatus.c_str()); + fprintf(stdout, " %s -> Test not executed: OpenCL-kernel compilation error\n", + kSkippedCompilation.c_str()); + fprintf(stdout, " %s -> Test not executed: Unsupported precision\n", + kUnsupportedPrecision.c_str()); + fprintf(stdout, " %s -> Test not completed: Reference CBLAS doesn't output error codes\n", + kUnsupportedReference.c_str()); + + // Initializes clBLAS + #ifdef CLBLAST_REF_CLBLAS + if (compare_clblas_) { + auto status = clblasSetup(); + if (status != CL_SUCCESS) { + throw std::runtime_error("clBLAS setup error: "+ToString(static_cast(status))); + } + } + #endif +} + +// Destructor prints the summary of the test cases and cleans-up the clBLAS library +template +Tester::~Tester() { + if (PrecisionSupported(device_)) { + 
fprintf(stdout, "* Completed all test-cases for this routine. Results:\n"); + fprintf(stdout, " %zu test(s) passed\n", tests_passed_); + if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); } + fprintf(stdout, " %zu test(s) skipped%s\n", tests_skipped_, kPrintEnd.c_str()); + if (tests_failed_ > 0) { fprintf(stdout, "%s", kPrintError.c_str()); } + fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str()); + } + fprintf(stdout, "\n"); + + // Cleans-up clBLAS + #ifdef CLBLAST_REF_CLBLAS + if (compare_clblas_) { + clblasTeardown(); + } + #endif +} + +// ================================================================================================= + +// Function called at the start of each test. This prints a header with information about the +// test and re-initializes all test data-structures. +template +void Tester::TestStart(const std::string &test_name, const std::string &test_configuration) { + + // Prints the header + fprintf(stdout, "* Testing %s'%s'%s for %s'%s'%s:\n", + kPrintMessage.c_str(), test_name.c_str(), kPrintEnd.c_str(), + kPrintMessage.c_str(), test_configuration.c_str(), kPrintEnd.c_str()); + if (!verbose_) { fprintf(stdout, " "); } + + // Empties the error log and the error/pass counters + error_log_.clear(); + num_passed_ = 0; + num_skipped_ = 0; + num_failed_ = 0; + print_count_ = 0; +} + +// Function called at the end of each test. This prints errors if any occured. It also prints a +// summary of the number of sub-tests passed/failed. 
+template +void Tester::TestEnd() { + if (!verbose_) { fprintf(stdout, "\n"); } + tests_passed_ += num_passed_; + tests_skipped_ += num_skipped_; + tests_failed_ += num_failed_; + + // Prints the errors + PrintErrorLog(error_log_); + + // Prints a test summary + auto pass_rate = 100*num_passed_ / static_cast(num_passed_ + num_skipped_ + num_failed_); + fprintf(stdout, " Pass rate %s%5.1lf%%%s:", kPrintMessage.c_str(), pass_rate, kPrintEnd.c_str()); + fprintf(stdout, " %zu passed /", num_passed_); + if (num_skipped_ != 0) { + fprintf(stdout, " %s%zu skipped%s /", kPrintWarning.c_str(), num_skipped_, kPrintEnd.c_str()); + } + else { + fprintf(stdout, " %zu skipped /", num_skipped_); + } + if (num_failed_ != 0) { + fprintf(stdout, " %s%zu failed%s\n", kPrintError.c_str(), num_failed_, kPrintEnd.c_str()); + } + else { + fprintf(stdout, " %zu failed\n", num_failed_); + } +} + +// ================================================================================================= + +// Handles a 'pass' or 'error' depending on whether there are any errors +template +void Tester::TestErrorCount(const size_t errors, const size_t size, const Arguments &args) { + + // Finished successfully + if (errors == 0) { + PrintTestResult(kSuccessData); + ReportPass(); + } + + // Error(s) occurred + else { + auto percentage = 100*errors / static_cast(size); + PrintTestResult(kErrorData); + ReportError({StatusCode::kSuccess, StatusCode::kSuccess, percentage, args}); + } +} + +// Compares two status codes for equality. The outcome can be a pass (they are the same), a warning +// (CLBlast reported a compilation error), or an error (they are different). 
+template +void Tester::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status, + const Arguments &args) { + + // Cannot compare error codes against a library other than clBLAS + if (compare_cblas_) { + PrintTestResult(kUnsupportedReference); + ReportSkipped(); + } + + // Finished successfully + else if (clblas_status == clblast_status) { + PrintTestResult(kSuccessStatus); + ReportPass(); + } + + // No support for this kind of precision + else if (clblast_status == StatusCode::kNoDoublePrecision || + clblast_status == StatusCode::kNoHalfPrecision) { + PrintTestResult(kUnsupportedPrecision); + ReportSkipped(); + } + + // Could not compile the CLBlast kernel properly + else if (clblast_status == StatusCode::kBuildProgramFailure || + clblast_status == StatusCode::kNotImplemented) { + PrintTestResult(kSkippedCompilation); + ReportSkipped(); + } + + // Error occurred + else { + PrintTestResult(kErrorStatus); + ReportError({clblas_status, clblast_status, kStatusError, args}); + if (verbose_) { + fprintf(stdout, "\n"); + PrintErrorLog({{clblas_status, clblast_status, kStatusError, args}}); + fprintf(stdout, " "); + } + } +} + +// ================================================================================================= + +// Retrieves the offset values to test with +template +const std::vector Tester::GetOffsets() const { + if (full_test_) { return {0, 10}; } + else { return {0}; } +} + +// Retrieves the options as a string for a specific test +template +std::string Tester::GetOptionsString(const Arguments &args) { + auto result = std::string(""); + const auto equals = std::string("="); + for (auto &o: options_) { + if (o == kArgM) { result += kArgM + equals + ToString(args.m) + " "; } + if (o == kArgN) { result += kArgN + equals + ToString(args.n) + " "; } + if (o == kArgK) { result += kArgK + equals + ToString(args.k) + " "; } + if (o == kArgKU) { result += kArgKU + equals + ToString(args.ku) + " "; } + if (o == kArgKL) { result += kArgKL 
+ equals + ToString(args.kl) + " "; } + if (o == kArgXInc) { result += kArgXInc + equals + ToString(args.x_inc) + " "; } + if (o == kArgYInc) { result += kArgYInc + equals + ToString(args.y_inc) + " "; } + if (o == kArgXOffset) { result += kArgXOffset + equals + ToString(args.x_offset) + " "; } + if (o == kArgYOffset) { result += kArgYOffset + equals + ToString(args.y_offset) + " "; } + if (o == kArgALeadDim) { result += kArgALeadDim + equals + ToString(args.a_ld) + " "; } + if (o == kArgBLeadDim) { result += kArgBLeadDim + equals + ToString(args.b_ld) + " "; } + if (o == kArgCLeadDim) { result += kArgCLeadDim + equals + ToString(args.c_ld) + " "; } + if (o == kArgAOffset) { result += kArgAOffset + equals + ToString(args.a_offset) + " "; } + if (o == kArgBOffset) { result += kArgBOffset + equals + ToString(args.b_offset) + " "; } + if (o == kArgCOffset) { result += kArgCOffset + equals + ToString(args.c_offset) + " "; } + if (o == kArgAPOffset) { result += kArgAPOffset + equals + ToString(args.ap_offset) + " "; } + if (o == kArgDotOffset){ result += kArgDotOffset + equals + ToString(args.dot_offset) + " "; } + } + return result; +} + +// As above, but now only prints information relevant to invalid buffer sizes +template +std::string Tester::GetSizesString(const Arguments &args) { + auto result = std::string(""); + const auto equals = std::string("="); + for (auto &o: options_) { + if (o == kArgM) { result += kArgM + equals + ToString(args.m) + " "; } + if (o == kArgN) { result += kArgN + equals + ToString(args.n) + " "; } + if (o == kArgK) { result += kArgK + equals + ToString(args.k) + " "; } + if (o == kArgXOffset) { result += "xsize" + equals + ToString(args.x_size) + " "; } + if (o == kArgYOffset) { result += "ysize" + equals + ToString(args.y_size) + " "; } + if (o == kArgAOffset) { result += "asize" + equals + ToString(args.a_size) + " "; } + if (o == kArgBOffset) { result += "bsize" + equals + ToString(args.b_size) + " "; } + if (o == kArgCOffset) { result 
+= "csize" + equals + ToString(args.c_size) + " "; } + if (o == kArgAPOffset) { result += "apsize" + equals + ToString(args.ap_size) + " "; } + if (o == kArgDotOffset){ result += "scalarsize" + equals + ToString(args.scalar_size) + " "; } + } + return result; +} + +// ================================================================================================= + +// A test can either pass, be skipped, or fail +template +void Tester::ReportPass() { + num_passed_++; +} +template +void Tester::ReportSkipped() { + num_skipped_++; +} +template +void Tester::ReportError(const ErrorLogEntry &error_log_entry) { + error_log_.push_back(error_log_entry); + num_failed_++; +} + +// ================================================================================================= + +// Prints the test-result symbol to screen. This function limits the maximum number of symbols per +// line by printing newlines once every so many calls. +template +void Tester::PrintTestResult(const std::string &message) { + if (verbose_) { + fprintf(stdout, "%s\n", message.c_str()); + } + else + { + if (print_count_ == kResultsPerLine) { + print_count_ = 0; + fprintf(stdout, "\n "); + } + fprintf(stdout, "%s", message.c_str()); + print_count_++; + } + std::cout << std::flush; +} + +// Prints details of errors occurred in a given error log +template +void Tester::PrintErrorLog(const std::vector &error_log) { + for (auto &entry: error_log) { + if (entry.error_percentage != kStatusError) { + fprintf(stdout, " Error rate %.1lf%%: ", entry.error_percentage); + } + else { + fprintf(stdout, " Status code %d (expected %d): ", entry.status_found, entry.status_expect); + } + fprintf(stdout, "%s\n", GetOptionsString(entry.args).c_str()); + } +} + +// ================================================================================================= +// Below are the non-member functions (separated because of otherwise required partial class +// template specialization) +// 
================================================================================================= + +// Compares two floating point values and returns whether they are within an acceptable error +// margin. This replaces GTest's EXPECT_NEAR(). +template +bool TestSimilarity(const T val1, const T val2) { + const auto difference = std::fabs(val1 - val2); + + // Set the allowed error margin for floating-point comparisons + constexpr auto kErrorMarginRelative = T(0.025); + constexpr auto kErrorMarginAbsolute = T(1.0e-3); + + // Shortcut, handles infinities + if (val1 == val2) { + return true; + } + // The values are zero or very small: the relative error is less meaningful + else if (val1 == 0 || val2 == 0 || difference < kErrorMarginAbsolute) { + return (difference < kErrorMarginAbsolute); + } + // Use relative error + else { + const auto absolute_sum = std::fabs(val1) + std::fabs(val2); + return (difference / absolute_sum) < kErrorMarginRelative; + } +} + +// Compiles the default case for standard data-types +template bool TestSimilarity(const float, const float); +template bool TestSimilarity(const double, const double); + +// Specialisations for non-standard data-types +template <> +bool TestSimilarity(const float2 val1, const float2 val2) { + auto real = TestSimilarity(val1.real(), val2.real()); + auto imag = TestSimilarity(val1.imag(), val2.imag()); + return (real && imag); +} +template <> +bool TestSimilarity(const double2 val1, const double2 val2) { + auto real = TestSimilarity(val1.real(), val2.real()); + auto imag = TestSimilarity(val1.imag(), val2.imag()); + return (real && imag); +} +template <> +bool TestSimilarity(const half val1, const half val2) { + return TestSimilarity(HalfToFloat(val1), HalfToFloat(val2)); +} + +// ================================================================================================= + +// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various +// routines. 
This function is specialised for the different data-types. +template <> const std::vector GetExampleScalars(const bool full_test) { + if (full_test) { return {0.0f, 1.0f, 3.14f}; } + else { return {3.14f}; } +} +template <> const std::vector GetExampleScalars(const bool full_test) { + if (full_test) { return {0.0, 1.0, 3.14}; } + else { return {3.14}; } +} +template <> const std::vector GetExampleScalars(const bool full_test) { + if (full_test) { return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}}; } + else { return {{2.42f, 3.14f}}; } +} +template <> const std::vector GetExampleScalars(const bool full_test) { + if (full_test) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; } + else { return {{2.42, 3.14}}; } +} +template <> const std::vector GetExampleScalars(const bool full_test) { + if (full_test) { return {FloatToHalf(0.0f), FloatToHalf(1.0f), FloatToHalf(3.14f)}; } + else { return {FloatToHalf(3.14f)}; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Tester; +template class Tester; +template class Tester; +template class Tester; +template class Tester; +template class Tester; +template class Tester; + +// ================================================================================================= +} // namespace clblast diff --git a/test/performance/client.cc b/test/performance/client.cc deleted file mode 100644 index d0068f8b..00000000 --- a/test/performance/client.cc +++ /dev/null @@ -1,375 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// This file implements the common functions for the client-test environment. -// -// ================================================================================================= - -#include "test/performance/client.hpp" - -#include -#include -#include -#include -#include - -namespace clblast { -// ================================================================================================= - -// Constructor -template -Client::Client(const Routine run_routine, - const Routine run_reference1, const Routine run_reference2, - const std::vector &options, - const GetMetric get_flops, const GetMetric get_bytes): - run_routine_(run_routine), - run_reference1_(run_reference1), - run_reference2_(run_reference2), - options_(options), - get_flops_(get_flops), - get_bytes_(get_bytes) { -} - -// ================================================================================================= - -// Parses all arguments available for the CLBlast client testers. Some arguments might not be -// applicable, but are searched for anyway to be able to create one common argument parser. All -// arguments have a default value in case they are not found. 
-template -Arguments Client::ParseArguments(int argc, char *argv[], const size_t level, - const GetMetric default_a_ld, - const GetMetric default_b_ld, - const GetMetric default_c_ld) { - auto args = Arguments{}; - auto help = std::string{"\n* Options given/available:\n"}; - - // These are the options which are not for every client: they are optional - for (auto &o: options_) { - - // Data-sizes - if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, size_t{512}); } - if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, size_t{512}); } - if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, size_t{512}); } - if (o == kArgKU) { args.ku = GetArgument(argc, argv, help, kArgKU, size_t{128}); } - if (o == kArgKL) { args.kl = GetArgument(argc, argv, help, kArgKL, size_t{128}); } - - // Data-layouts - if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); } - if (o == kArgATransp) { args.a_transpose = GetArgument(argc, argv, help, kArgATransp, Transpose::kNo); } - if (o == kArgBTransp) { args.b_transpose = GetArgument(argc, argv, help, kArgBTransp, Transpose::kNo); } - if (o == kArgSide) { args.side = GetArgument(argc, argv, help, kArgSide, Side::kLeft); } - if (o == kArgTriangle) { args.triangle = GetArgument(argc, argv, help, kArgTriangle, Triangle::kUpper); } - if (o == kArgDiagonal) { args.diagonal = GetArgument(argc, argv, help, kArgDiagonal, Diagonal::kUnit); } - - // Vector arguments - if (o == kArgXInc) { args.x_inc = GetArgument(argc, argv, help, kArgXInc, size_t{1}); } - if (o == kArgYInc) { args.y_inc = GetArgument(argc, argv, help, kArgYInc, size_t{1}); } - if (o == kArgXOffset) { args.x_offset = GetArgument(argc, argv, help, kArgXOffset, size_t{0}); } - if (o == kArgYOffset) { args.y_offset = GetArgument(argc, argv, help, kArgYOffset, size_t{0}); } - - // Matrix arguments - if (o == kArgALeadDim) { args.a_ld = GetArgument(argc, argv, help, kArgALeadDim, default_a_ld(args)); } - if (o 
== kArgBLeadDim) { args.b_ld = GetArgument(argc, argv, help, kArgBLeadDim, default_b_ld(args)); } - if (o == kArgCLeadDim) { args.c_ld = GetArgument(argc, argv, help, kArgCLeadDim, default_c_ld(args)); } - if (o == kArgAOffset) { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); } - if (o == kArgBOffset) { args.b_offset = GetArgument(argc, argv, help, kArgBOffset, size_t{0}); } - if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); } - if (o == kArgAPOffset) { args.ap_offset= GetArgument(argc, argv, help, kArgAPOffset, size_t{0}); } - - // Scalar result arguments - if (o == kArgDotOffset) { args.dot_offset = GetArgument(argc, argv, help, kArgDotOffset, size_t{0}); } - if (o == kArgNrm2Offset) { args.nrm2_offset = GetArgument(argc, argv, help, kArgNrm2Offset, size_t{0}); } - if (o == kArgAsumOffset) { args.asum_offset = GetArgument(argc, argv, help, kArgAsumOffset, size_t{0}); } - if (o == kArgImaxOffset) { args.imax_offset = GetArgument(argc, argv, help, kArgImaxOffset, size_t{0}); } - - // Scalar values - if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar()); } - if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar()); } - } - - // These are the options common to all routines - args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); - args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); - args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); - #ifdef CLBLAST_REF_CLBLAS - args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1); - #else - args.compare_clblas = 0; - #endif - #ifdef CLBLAST_REF_CBLAS - args.compare_cblas = GetArgument(argc, argv, help, kArgComparecblas, 1); - #else - args.compare_cblas = 0; - #endif - args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1}); - args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0}); - 
args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10}); - args.print_help = CheckArgument(argc, argv, help, kArgHelp); - args.silent = CheckArgument(argc, argv, help, kArgQuiet); - args.no_abbrv = CheckArgument(argc, argv, help, kArgNoAbbreviations); - - // Prints the chosen (or defaulted) arguments to screen. This also serves as the help message, - // which is thus always displayed (unless silence is specified). - if (!args.silent) { fprintf(stdout, "%s\n", help.c_str()); } - - // Comparison against a non-BLAS routine is not supported - if (level == 4) { // level-4 == level-X - if (args.compare_clblas != 0 || args.compare_cblas != 0) { - if (!args.silent) { - fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for this non-BLAS routine\n\n"); - } - } - args.compare_clblas = 0; - args.compare_cblas = 0; - } - - // Comparison against clBLAS or a CPU BLAS library is not supported in case of half-precision - if (args.precision == Precision::kHalf) { - if (args.compare_clblas != 0 || args.compare_cblas != 0) { - if (!args.silent) { - fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for half-precision\n\n"); - } - } - args.compare_clblas = 0; - args.compare_cblas = 0; - } - - // Returns the arguments - return args; -} - -// ================================================================================================= - -// This is main performance tester -template -void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) { - - // Prints the header of the output table - PrintTableHeader(args); - - // Initializes OpenCL and the libraries - auto platform = Platform(args.platform_id); - auto device = Device(platform, args.device_id); - auto context = Context(device); - auto queue = Queue(context, device); - #ifdef CLBLAST_REF_CLBLAS - if (args.compare_clblas) { clblasSetup(); } - #endif - - // Iterates over all "num_step" values jumping by "step" each time - auto s = size_t{0}; - while(true) { - - // Sets the buffer 
sizes (routine-specific) - set_sizes(args); - - // Populates input host matrices with random data - std::vector x_source(args.x_size); - std::vector y_source(args.y_size); - std::vector a_source(args.a_size); - std::vector b_source(args.b_size); - std::vector c_source(args.c_size); - std::vector ap_source(args.ap_size); - std::vector scalar_source(args.scalar_size); - PopulateVector(x_source); - PopulateVector(y_source); - PopulateVector(a_source); - PopulateVector(b_source); - PopulateVector(c_source); - PopulateVector(ap_source); - PopulateVector(scalar_source); - - // Creates the matrices on the device - auto x_vec = Buffer(context, args.x_size); - auto y_vec = Buffer(context, args.y_size); - auto a_mat = Buffer(context, args.a_size); - auto b_mat = Buffer(context, args.b_size); - auto c_mat = Buffer(context, args.c_size); - auto ap_mat = Buffer(context, args.ap_size); - auto scalar = Buffer(context, args.scalar_size); - x_vec.Write(queue, args.x_size, x_source); - y_vec.Write(queue, args.y_size, y_source); - a_mat.Write(queue, args.a_size, a_source); - b_mat.Write(queue, args.b_size, b_source); - c_mat.Write(queue, args.c_size, c_source); - ap_mat.Write(queue, args.ap_size, ap_source); - scalar.Write(queue, args.scalar_size, scalar_source); - auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar}; - - // Runs the routines and collects the timings - auto timings = std::vector>(); - auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast"); - timings.push_back(std::pair("CLBlast", ms_clblast)); - if (args.compare_clblas) { - auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS"); - timings.push_back(std::pair("clBLAS", ms_clblas)); - } - if (args.compare_cblas) { - auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS"); - timings.push_back(std::pair("CPU BLAS", ms_cblas)); - } - - // Prints the performance of the tested 
libraries - PrintTableRow(args, timings); - - // Makes the jump to the next step - ++s; - if (s >= args.num_steps) { break; } - args.m += args.step; - args.n += args.step; - args.k += args.step; - args.a_ld += args.step; - args.b_ld += args.step; - args.c_ld += args.step; - } - - // Cleans-up and returns - #ifdef CLBLAST_REF_CLBLAS - if (args.compare_clblas) { clblasTeardown(); } - #endif -} - -// ================================================================================================= - -// Creates a vector of timing results, filled with execution times of the 'main computation'. The -// timing is performed using the milliseconds chrono functions. The function returns the minimum -// value found in the vector of timing results. The return value is in milliseconds. -template -double Client::TimedExecution(const size_t num_runs, const Arguments &args, - Buffers &buffers, Queue &queue, - Routine run_blas, const std::string &library_name) { - auto timings = std::vector(num_runs); - for (auto &timing: timings) { - auto start_time = std::chrono::steady_clock::now(); - - // Executes the main computation - auto status = StatusCode::kSuccess; - try { - status = run_blas(args, buffers, queue); - } catch (...) 
{ status = static_cast(kUnknownError); } - if (status != StatusCode::kSuccess) { - throw std::runtime_error(library_name+" error: "+ToString(static_cast(status))); - } - - // Records and stores the end-time - auto elapsed_time = std::chrono::steady_clock::now() - start_time; - timing = std::chrono::duration(elapsed_time).count(); - } - return *std::min_element(timings.begin(), timings.end()); -} - -// ================================================================================================= - -// Prints the header of the performance table -template -void Client::PrintTableHeader(const Arguments& args) { - - // First line (optional) - if (!args.silent) { - for (auto i=size_t{0}; i"); - if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); } - if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); } - fprintf(stdout, " |\n"); - } - - // Second line - for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); } - fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1"); - if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); } - if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); } - fprintf(stdout, "\n"); -} - -// Print a performance-result row -template -void Client::PrintTableRow(const Arguments& args, - const std::vector>& timings) { - - // Creates a vector of relevant variables - auto integers = std::vector{}; - for (auto &o: options_) { - if (o == kArgM) { integers.push_back(args.m); } - else if (o == kArgN) { integers.push_back(args.n); } - else if (o == kArgK) { integers.push_back(args.k); } - else if (o == kArgKU) { integers.push_back(args.ku); } - else if (o == kArgKL) { integers.push_back(args.kl); } - else if (o == kArgLayout) { integers.push_back(static_cast(args.layout)); } - else if (o == kArgSide) { integers.push_back(static_cast(args.side)); } - else if (o == kArgTriangle) { integers.push_back(static_cast(args.triangle)); } - else if (o 
== kArgATransp) { integers.push_back(static_cast(args.a_transpose)); } - else if (o == kArgBTransp) { integers.push_back(static_cast(args.b_transpose)); } - else if (o == kArgDiagonal) { integers.push_back(static_cast(args.diagonal)); } - else if (o == kArgXInc) { integers.push_back(args.x_inc); } - else if (o == kArgYInc) { integers.push_back(args.y_inc); } - else if (o == kArgXOffset) { integers.push_back(args.x_offset); } - else if (o == kArgYOffset) { integers.push_back(args.y_offset); } - else if (o == kArgALeadDim) { integers.push_back(args.a_ld); } - else if (o == kArgBLeadDim) { integers.push_back(args.b_ld); } - else if (o == kArgCLeadDim) { integers.push_back(args.c_ld); } - else if (o == kArgAOffset) { integers.push_back(args.a_offset); } - else if (o == kArgBOffset) { integers.push_back(args.b_offset); } - else if (o == kArgCOffset) { integers.push_back(args.c_offset); } - else if (o == kArgAPOffset) { integers.push_back(args.ap_offset); } - else if (o == kArgDotOffset) {integers.push_back(args.dot_offset); } - else if (o == kArgNrm2Offset){integers.push_back(args.nrm2_offset); } - else if (o == kArgAsumOffset){integers.push_back(args.asum_offset); } - else if (o == kArgImaxOffset){integers.push_back(args.imax_offset); } - } - auto strings = std::vector{}; - for (auto &o: options_) { - if (o == kArgAlpha) { strings.push_back(ToString(args.alpha)); } - else if (o == kArgBeta) { strings.push_back(ToString(args.beta)); } - } - - // Outputs the argument values - for (auto &argument: integers) { - if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) { - fprintf(stdout, "%8zuM;", argument/(1024*1024)); - } - else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) { - fprintf(stdout, "%8zuK;", argument/1024); - } - else { - fprintf(stdout, "%9zu;", argument); - } - } - for (auto &argument: strings) { - fprintf(stdout, "%9s;", argument.c_str()); - } - - // Loops over all tested libraries - for (const auto& timing : 
timings) { - - // Computes the GFLOPS and GB/s metrics - auto flops = get_flops_(args); - auto bytes = get_bytes_(args); - auto gflops = (timing.second != 0.0) ? (flops*1e-6)/timing.second : 0; - auto gbs = (timing.second != 0.0) ? (bytes*1e-6)/timing.second : 0; - - // Outputs the performance numbers - if (timing.first != "CLBlast") { fprintf(stdout, ";"); } - fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", timing.second, gflops, gbs); - } - fprintf(stdout, "\n"); -} - -// ================================================================================================= - -// Compiles the templated class -template class Client; -template class Client; -template class Client; -template class Client; -template class Client; -template class Client; -template class Client; - -// ================================================================================================= -} // namespace clblast diff --git a/test/performance/client.cpp b/test/performance/client.cpp new file mode 100644 index 00000000..d0068f8b --- /dev/null +++ b/test/performance/client.cpp @@ -0,0 +1,375 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the common functions for the client-test environment. 
+// +// ================================================================================================= + +#include "test/performance/client.hpp" + +#include +#include +#include +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor +template +Client::Client(const Routine run_routine, + const Routine run_reference1, const Routine run_reference2, + const std::vector &options, + const GetMetric get_flops, const GetMetric get_bytes): + run_routine_(run_routine), + run_reference1_(run_reference1), + run_reference2_(run_reference2), + options_(options), + get_flops_(get_flops), + get_bytes_(get_bytes) { +} + +// ================================================================================================= + +// Parses all arguments available for the CLBlast client testers. Some arguments might not be +// applicable, but are searched for anyway to be able to create one common argument parser. All +// arguments have a default value in case they are not found. 
+template +Arguments Client::ParseArguments(int argc, char *argv[], const size_t level, + const GetMetric default_a_ld, + const GetMetric default_b_ld, + const GetMetric default_c_ld) { + auto args = Arguments{}; + auto help = std::string{"\n* Options given/available:\n"}; + + // These are the options which are not for every client: they are optional + for (auto &o: options_) { + + // Data-sizes + if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, size_t{512}); } + if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, size_t{512}); } + if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, size_t{512}); } + if (o == kArgKU) { args.ku = GetArgument(argc, argv, help, kArgKU, size_t{128}); } + if (o == kArgKL) { args.kl = GetArgument(argc, argv, help, kArgKL, size_t{128}); } + + // Data-layouts + if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); } + if (o == kArgATransp) { args.a_transpose = GetArgument(argc, argv, help, kArgATransp, Transpose::kNo); } + if (o == kArgBTransp) { args.b_transpose = GetArgument(argc, argv, help, kArgBTransp, Transpose::kNo); } + if (o == kArgSide) { args.side = GetArgument(argc, argv, help, kArgSide, Side::kLeft); } + if (o == kArgTriangle) { args.triangle = GetArgument(argc, argv, help, kArgTriangle, Triangle::kUpper); } + if (o == kArgDiagonal) { args.diagonal = GetArgument(argc, argv, help, kArgDiagonal, Diagonal::kUnit); } + + // Vector arguments + if (o == kArgXInc) { args.x_inc = GetArgument(argc, argv, help, kArgXInc, size_t{1}); } + if (o == kArgYInc) { args.y_inc = GetArgument(argc, argv, help, kArgYInc, size_t{1}); } + if (o == kArgXOffset) { args.x_offset = GetArgument(argc, argv, help, kArgXOffset, size_t{0}); } + if (o == kArgYOffset) { args.y_offset = GetArgument(argc, argv, help, kArgYOffset, size_t{0}); } + + // Matrix arguments + if (o == kArgALeadDim) { args.a_ld = GetArgument(argc, argv, help, kArgALeadDim, default_a_ld(args)); } + if (o 
== kArgBLeadDim) { args.b_ld = GetArgument(argc, argv, help, kArgBLeadDim, default_b_ld(args)); } + if (o == kArgCLeadDim) { args.c_ld = GetArgument(argc, argv, help, kArgCLeadDim, default_c_ld(args)); } + if (o == kArgAOffset) { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); } + if (o == kArgBOffset) { args.b_offset = GetArgument(argc, argv, help, kArgBOffset, size_t{0}); } + if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); } + if (o == kArgAPOffset) { args.ap_offset= GetArgument(argc, argv, help, kArgAPOffset, size_t{0}); } + + // Scalar result arguments + if (o == kArgDotOffset) { args.dot_offset = GetArgument(argc, argv, help, kArgDotOffset, size_t{0}); } + if (o == kArgNrm2Offset) { args.nrm2_offset = GetArgument(argc, argv, help, kArgNrm2Offset, size_t{0}); } + if (o == kArgAsumOffset) { args.asum_offset = GetArgument(argc, argv, help, kArgAsumOffset, size_t{0}); } + if (o == kArgImaxOffset) { args.imax_offset = GetArgument(argc, argv, help, kArgImaxOffset, size_t{0}); } + + // Scalar values + if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar()); } + if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar()); } + } + + // These are the options common to all routines + args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); + args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); + args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); + #ifdef CLBLAST_REF_CLBLAS + args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1); + #else + args.compare_clblas = 0; + #endif + #ifdef CLBLAST_REF_CBLAS + args.compare_cblas = GetArgument(argc, argv, help, kArgComparecblas, 1); + #else + args.compare_cblas = 0; + #endif + args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1}); + args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0}); + 
args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10}); + args.print_help = CheckArgument(argc, argv, help, kArgHelp); + args.silent = CheckArgument(argc, argv, help, kArgQuiet); + args.no_abbrv = CheckArgument(argc, argv, help, kArgNoAbbreviations); + + // Prints the chosen (or defaulted) arguments to screen. This also serves as the help message, + // which is thus always displayed (unless silence is specified). + if (!args.silent) { fprintf(stdout, "%s\n", help.c_str()); } + + // Comparison against a non-BLAS routine is not supported + if (level == 4) { // level-4 == level-X + if (args.compare_clblas != 0 || args.compare_cblas != 0) { + if (!args.silent) { + fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for this non-BLAS routine\n\n"); + } + } + args.compare_clblas = 0; + args.compare_cblas = 0; + } + + // Comparison against clBLAS or a CPU BLAS library is not supported in case of half-precision + if (args.precision == Precision::kHalf) { + if (args.compare_clblas != 0 || args.compare_cblas != 0) { + if (!args.silent) { + fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for half-precision\n\n"); + } + } + args.compare_clblas = 0; + args.compare_cblas = 0; + } + + // Returns the arguments + return args; +} + +// ================================================================================================= + +// This is main performance tester +template +void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) { + + // Prints the header of the output table + PrintTableHeader(args); + + // Initializes OpenCL and the libraries + auto platform = Platform(args.platform_id); + auto device = Device(platform, args.device_id); + auto context = Context(device); + auto queue = Queue(context, device); + #ifdef CLBLAST_REF_CLBLAS + if (args.compare_clblas) { clblasSetup(); } + #endif + + // Iterates over all "num_step" values jumping by "step" each time + auto s = size_t{0}; + while(true) { + + // Sets the buffer 
sizes (routine-specific) + set_sizes(args); + + // Populates input host matrices with random data + std::vector x_source(args.x_size); + std::vector y_source(args.y_size); + std::vector a_source(args.a_size); + std::vector b_source(args.b_size); + std::vector c_source(args.c_size); + std::vector ap_source(args.ap_size); + std::vector scalar_source(args.scalar_size); + PopulateVector(x_source); + PopulateVector(y_source); + PopulateVector(a_source); + PopulateVector(b_source); + PopulateVector(c_source); + PopulateVector(ap_source); + PopulateVector(scalar_source); + + // Creates the matrices on the device + auto x_vec = Buffer(context, args.x_size); + auto y_vec = Buffer(context, args.y_size); + auto a_mat = Buffer(context, args.a_size); + auto b_mat = Buffer(context, args.b_size); + auto c_mat = Buffer(context, args.c_size); + auto ap_mat = Buffer(context, args.ap_size); + auto scalar = Buffer(context, args.scalar_size); + x_vec.Write(queue, args.x_size, x_source); + y_vec.Write(queue, args.y_size, y_source); + a_mat.Write(queue, args.a_size, a_source); + b_mat.Write(queue, args.b_size, b_source); + c_mat.Write(queue, args.c_size, c_source); + ap_mat.Write(queue, args.ap_size, ap_source); + scalar.Write(queue, args.scalar_size, scalar_source); + auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar}; + + // Runs the routines and collects the timings + auto timings = std::vector>(); + auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast"); + timings.push_back(std::pair("CLBlast", ms_clblast)); + if (args.compare_clblas) { + auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS"); + timings.push_back(std::pair("clBLAS", ms_clblas)); + } + if (args.compare_cblas) { + auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS"); + timings.push_back(std::pair("CPU BLAS", ms_cblas)); + } + + // Prints the performance of the tested 
libraries + PrintTableRow(args, timings); + + // Makes the jump to the next step + ++s; + if (s >= args.num_steps) { break; } + args.m += args.step; + args.n += args.step; + args.k += args.step; + args.a_ld += args.step; + args.b_ld += args.step; + args.c_ld += args.step; + } + + // Cleans-up and returns + #ifdef CLBLAST_REF_CLBLAS + if (args.compare_clblas) { clblasTeardown(); } + #endif +} + +// ================================================================================================= + +// Creates a vector of timing results, filled with execution times of the 'main computation'. The +// timing is performed using the milliseconds chrono functions. The function returns the minimum +// value found in the vector of timing results. The return value is in milliseconds. +template +double Client::TimedExecution(const size_t num_runs, const Arguments &args, + Buffers &buffers, Queue &queue, + Routine run_blas, const std::string &library_name) { + auto timings = std::vector(num_runs); + for (auto &timing: timings) { + auto start_time = std::chrono::steady_clock::now(); + + // Executes the main computation + auto status = StatusCode::kSuccess; + try { + status = run_blas(args, buffers, queue); + } catch (...) 
{ status = static_cast(kUnknownError); } + if (status != StatusCode::kSuccess) { + throw std::runtime_error(library_name+" error: "+ToString(static_cast(status))); + } + + // Records and stores the end-time + auto elapsed_time = std::chrono::steady_clock::now() - start_time; + timing = std::chrono::duration(elapsed_time).count(); + } + return *std::min_element(timings.begin(), timings.end()); +} + +// ================================================================================================= + +// Prints the header of the performance table +template +void Client::PrintTableHeader(const Arguments& args) { + + // First line (optional) + if (!args.silent) { + for (auto i=size_t{0}; i"); + if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); } + if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); } + fprintf(stdout, " |\n"); + } + + // Second line + for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); } + fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1"); + if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); } + if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); } + fprintf(stdout, "\n"); +} + +// Print a performance-result row +template +void Client::PrintTableRow(const Arguments& args, + const std::vector>& timings) { + + // Creates a vector of relevant variables + auto integers = std::vector{}; + for (auto &o: options_) { + if (o == kArgM) { integers.push_back(args.m); } + else if (o == kArgN) { integers.push_back(args.n); } + else if (o == kArgK) { integers.push_back(args.k); } + else if (o == kArgKU) { integers.push_back(args.ku); } + else if (o == kArgKL) { integers.push_back(args.kl); } + else if (o == kArgLayout) { integers.push_back(static_cast(args.layout)); } + else if (o == kArgSide) { integers.push_back(static_cast(args.side)); } + else if (o == kArgTriangle) { integers.push_back(static_cast(args.triangle)); } + else if (o 
== kArgATransp) { integers.push_back(static_cast(args.a_transpose)); } + else if (o == kArgBTransp) { integers.push_back(static_cast(args.b_transpose)); } + else if (o == kArgDiagonal) { integers.push_back(static_cast(args.diagonal)); } + else if (o == kArgXInc) { integers.push_back(args.x_inc); } + else if (o == kArgYInc) { integers.push_back(args.y_inc); } + else if (o == kArgXOffset) { integers.push_back(args.x_offset); } + else if (o == kArgYOffset) { integers.push_back(args.y_offset); } + else if (o == kArgALeadDim) { integers.push_back(args.a_ld); } + else if (o == kArgBLeadDim) { integers.push_back(args.b_ld); } + else if (o == kArgCLeadDim) { integers.push_back(args.c_ld); } + else if (o == kArgAOffset) { integers.push_back(args.a_offset); } + else if (o == kArgBOffset) { integers.push_back(args.b_offset); } + else if (o == kArgCOffset) { integers.push_back(args.c_offset); } + else if (o == kArgAPOffset) { integers.push_back(args.ap_offset); } + else if (o == kArgDotOffset) {integers.push_back(args.dot_offset); } + else if (o == kArgNrm2Offset){integers.push_back(args.nrm2_offset); } + else if (o == kArgAsumOffset){integers.push_back(args.asum_offset); } + else if (o == kArgImaxOffset){integers.push_back(args.imax_offset); } + } + auto strings = std::vector{}; + for (auto &o: options_) { + if (o == kArgAlpha) { strings.push_back(ToString(args.alpha)); } + else if (o == kArgBeta) { strings.push_back(ToString(args.beta)); } + } + + // Outputs the argument values + for (auto &argument: integers) { + if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) { + fprintf(stdout, "%8zuM;", argument/(1024*1024)); + } + else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) { + fprintf(stdout, "%8zuK;", argument/1024); + } + else { + fprintf(stdout, "%9zu;", argument); + } + } + for (auto &argument: strings) { + fprintf(stdout, "%9s;", argument.c_str()); + } + + // Loops over all tested libraries + for (const auto& timing : 
timings) { + + // Computes the GFLOPS and GB/s metrics + auto flops = get_flops_(args); + auto bytes = get_bytes_(args); + auto gflops = (timing.second != 0.0) ? (flops*1e-6)/timing.second : 0; + auto gbs = (timing.second != 0.0) ? (bytes*1e-6)/timing.second : 0; + + // Outputs the performance numbers + if (timing.first != "CLBlast") { fprintf(stdout, ";"); } + fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", timing.second, gflops, gbs); + } + fprintf(stdout, "\n"); +} + +// ================================================================================================= + +// Compiles the templated class +template class Client; +template class Client; +template class Client; +template class Client; +template class Client; +template class Client; +template class Client; + +// ================================================================================================= +} // namespace clblast diff --git a/test/performance/routines/level1/xamax.cc b/test/performance/routines/level1/xamax.cc deleted file mode 100644 index 450678e0..00000000 --- a/test/performance/routines/level1/xamax.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xamax.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xamax.cpp b/test/performance/routines/level1/xamax.cpp new file mode 100644 index 00000000..450678e0 --- /dev/null +++ b/test/performance/routines/level1/xamax.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xamax.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xasum.cc b/test/performance/routines/level1/xasum.cc deleted file mode 100644 index c21102f5..00000000 --- a/test/performance/routines/level1/xasum.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xasum.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xasum.cpp b/test/performance/routines/level1/xasum.cpp new file mode 100644 index 00000000..c21102f5 --- /dev/null +++ b/test/performance/routines/level1/xasum.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xasum.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xaxpy.cc b/test/performance/routines/level1/xaxpy.cc deleted file mode 100644 index e1c4935e..00000000 --- a/test/performance/routines/level1/xaxpy.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xaxpy.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xaxpy.cpp b/test/performance/routines/level1/xaxpy.cpp new file mode 100644 index 00000000..e1c4935e --- /dev/null +++ b/test/performance/routines/level1/xaxpy.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xaxpy.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xcopy.cc b/test/performance/routines/level1/xcopy.cc deleted file mode 100644 index ea3531a0..00000000 --- a/test/performance/routines/level1/xcopy.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xcopy.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xcopy.cpp b/test/performance/routines/level1/xcopy.cpp new file mode 100644 index 00000000..ea3531a0 --- /dev/null +++ b/test/performance/routines/level1/xcopy.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xcopy.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xdot.cc b/test/performance/routines/level1/xdot.cc deleted file mode 100644 index 09fe9258..00000000 --- a/test/performance/routines/level1/xdot.cc +++ /dev/null @@ -1,34 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xdot.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xdot.cpp b/test/performance/routines/level1/xdot.cpp new file mode 100644 index 00000000..09fe9258 --- /dev/null +++ b/test/performance/routines/level1/xdot.cpp @@ -0,0 +1,34 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xdot.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xdotc.cc b/test/performance/routines/level1/xdotc.cc deleted file mode 100644 index 6e716ebb..00000000 --- a/test/performance/routines/level1/xdotc.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xdotc.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xdotc.cpp b/test/performance/routines/level1/xdotc.cpp new file mode 100644 index 00000000..6e716ebb --- /dev/null +++ b/test/performance/routines/level1/xdotc.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xdotc.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xdotu.cc b/test/performance/routines/level1/xdotu.cc deleted file mode 100644 index d011d558..00000000 --- a/test/performance/routines/level1/xdotu.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xdotu.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xdotu.cpp b/test/performance/routines/level1/xdotu.cpp new file mode 100644 index 00000000..d011d558 --- /dev/null +++ b/test/performance/routines/level1/xdotu.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xdotu.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xnrm2.cc b/test/performance/routines/level1/xnrm2.cc deleted file mode 100644 index 1d6e177d..00000000 --- a/test/performance/routines/level1/xnrm2.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xnrm2.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xnrm2.cpp b/test/performance/routines/level1/xnrm2.cpp new file mode 100644 index 00000000..1d6e177d --- /dev/null +++ b/test/performance/routines/level1/xnrm2.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xnrm2.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xrot.cc b/test/performance/routines/level1/xrot.cc deleted file mode 100644 index 4b543f1b..00000000 --- a/test/performance/routines/level1/xrot.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xrot.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xrot.cpp b/test/performance/routines/level1/xrot.cpp new file mode 100644 index 00000000..4b543f1b --- /dev/null +++ b/test/performance/routines/level1/xrot.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xrot.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xrotg.cc b/test/performance/routines/level1/xrotg.cc deleted file mode 100644 index e52704b0..00000000 --- a/test/performance/routines/level1/xrotg.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xrotg.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xrotg.cpp b/test/performance/routines/level1/xrotg.cpp new file mode 100644 index 00000000..e52704b0 --- /dev/null +++ b/test/performance/routines/level1/xrotg.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xrotg.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xrotm.cc b/test/performance/routines/level1/xrotm.cc deleted file mode 100644 index 83ee1d9d..00000000 --- a/test/performance/routines/level1/xrotm.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xrotm.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xrotm.cpp b/test/performance/routines/level1/xrotm.cpp new file mode 100644 index 00000000..83ee1d9d --- /dev/null +++ b/test/performance/routines/level1/xrotm.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xrotm.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xrotmg.cc b/test/performance/routines/level1/xrotmg.cc deleted file mode 100644 index ee1539d9..00000000 --- a/test/performance/routines/level1/xrotmg.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xrotmg.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xrotmg.cpp b/test/performance/routines/level1/xrotmg.cpp new file mode 100644 index 00000000..ee1539d9 --- /dev/null +++ b/test/performance/routines/level1/xrotmg.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xrotmg.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xscal.cc b/test/performance/routines/level1/xscal.cc deleted file mode 100644 index adb83a90..00000000 --- a/test/performance/routines/level1/xscal.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xscal.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xscal.cpp b/test/performance/routines/level1/xscal.cpp new file mode 100644 index 00000000..adb83a90 --- /dev/null +++ b/test/performance/routines/level1/xscal.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xscal.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xswap.cc b/test/performance/routines/level1/xswap.cc deleted file mode 100644 index 7f591d19..00000000 --- a/test/performance/routines/level1/xswap.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level1/xswap.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level1/xswap.cpp b/test/performance/routines/level1/xswap.cpp new file mode 100644 index 00000000..7f591d19 --- /dev/null +++ b/test/performance/routines/level1/xswap.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level1/xswap.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xgbmv.cc b/test/performance/routines/level2/xgbmv.cc deleted file mode 100644 index 6aa72ded..00000000 --- a/test/performance/routines/level2/xgbmv.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xgbmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xgbmv.cpp b/test/performance/routines/level2/xgbmv.cpp new file mode 100644 index 00000000..6aa72ded --- /dev/null +++ b/test/performance/routines/level2/xgbmv.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xgbmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xgemv.cc b/test/performance/routines/level2/xgemv.cc deleted file mode 100644 index fdcef95d..00000000 --- a/test/performance/routines/level2/xgemv.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xgemv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xgemv.cpp b/test/performance/routines/level2/xgemv.cpp new file mode 100644 index 00000000..fdcef95d --- /dev/null +++ b/test/performance/routines/level2/xgemv.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xgemv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xger.cc b/test/performance/routines/level2/xger.cc deleted file mode 100644 index c4f3699d..00000000 --- a/test/performance/routines/level2/xger.cc +++ /dev/null @@ -1,34 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xger.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xger.cpp b/test/performance/routines/level2/xger.cpp new file mode 100644 index 00000000..c4f3699d --- /dev/null +++ b/test/performance/routines/level2/xger.cpp @@ -0,0 +1,34 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xger.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xgerc.cc b/test/performance/routines/level2/xgerc.cc deleted file mode 100644 index f855dc11..00000000 --- a/test/performance/routines/level2/xgerc.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xgerc.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xgerc.cpp b/test/performance/routines/level2/xgerc.cpp new file mode 100644 index 00000000..f855dc11 --- /dev/null +++ b/test/performance/routines/level2/xgerc.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xgerc.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xgeru.cc b/test/performance/routines/level2/xgeru.cc deleted file mode 100644 index 2bf885e3..00000000 --- a/test/performance/routines/level2/xgeru.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xgeru.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xgeru.cpp b/test/performance/routines/level2/xgeru.cpp new file mode 100644 index 00000000..2bf885e3 --- /dev/null +++ b/test/performance/routines/level2/xgeru.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xgeru.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhbmv.cc b/test/performance/routines/level2/xhbmv.cc deleted file mode 100644 index b7f3b9ad..00000000 --- a/test/performance/routines/level2/xhbmv.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xhbmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xhbmv.cpp b/test/performance/routines/level2/xhbmv.cpp new file mode 100644 index 00000000..b7f3b9ad --- /dev/null +++ b/test/performance/routines/level2/xhbmv.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xhbmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhemv.cc b/test/performance/routines/level2/xhemv.cc deleted file mode 100644 index e1168083..00000000 --- a/test/performance/routines/level2/xhemv.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xhemv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xhemv.cpp b/test/performance/routines/level2/xhemv.cpp new file mode 100644 index 00000000..e1168083 --- /dev/null +++ b/test/performance/routines/level2/xhemv.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xhemv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xher.cc b/test/performance/routines/level2/xher.cc deleted file mode 100644 index 0d1bc1dd..00000000 --- a/test/performance/routines/level2/xher.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xher.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xher.cpp b/test/performance/routines/level2/xher.cpp new file mode 100644 index 00000000..0d1bc1dd --- /dev/null +++ b/test/performance/routines/level2/xher.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xher.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xher2.cc b/test/performance/routines/level2/xher2.cc deleted file mode 100644 index 3d98c838..00000000 --- a/test/performance/routines/level2/xher2.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xher2.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xher2.cpp b/test/performance/routines/level2/xher2.cpp new file mode 100644 index 00000000..3d98c838 --- /dev/null +++ b/test/performance/routines/level2/xher2.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xher2.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhpmv.cc b/test/performance/routines/level2/xhpmv.cc deleted file mode 100644 index c3bc3d9c..00000000 --- a/test/performance/routines/level2/xhpmv.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xhpmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xhpmv.cpp b/test/performance/routines/level2/xhpmv.cpp new file mode 100644 index 00000000..c3bc3d9c --- /dev/null +++ b/test/performance/routines/level2/xhpmv.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xhpmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhpr.cc b/test/performance/routines/level2/xhpr.cc deleted file mode 100644 index afc65b25..00000000 --- a/test/performance/routines/level2/xhpr.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xhpr.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xhpr.cpp b/test/performance/routines/level2/xhpr.cpp new file mode 100644 index 00000000..afc65b25 --- /dev/null +++ b/test/performance/routines/level2/xhpr.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xhpr.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhpr2.cc b/test/performance/routines/level2/xhpr2.cc deleted file mode 100644 index c543dc90..00000000 --- a/test/performance/routines/level2/xhpr2.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xhpr2.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xhpr2.cpp b/test/performance/routines/level2/xhpr2.cpp new file mode 100644 index 00000000..c543dc90 --- /dev/null +++ b/test/performance/routines/level2/xhpr2.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xhpr2.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xsbmv.cc b/test/performance/routines/level2/xsbmv.cc deleted file mode 100644 index 32899a74..00000000 --- a/test/performance/routines/level2/xsbmv.cc +++ /dev/null @@ -1,34 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xsbmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xsbmv.cpp b/test/performance/routines/level2/xsbmv.cpp new file mode 100644 index 00000000..32899a74 --- /dev/null +++ b/test/performance/routines/level2/xsbmv.cpp @@ -0,0 +1,34 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xsbmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xspmv.cc b/test/performance/routines/level2/xspmv.cc deleted file mode 100644 index 0b0d2409..00000000 --- a/test/performance/routines/level2/xspmv.cc +++ /dev/null @@ -1,34 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xspmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xspmv.cpp b/test/performance/routines/level2/xspmv.cpp new file mode 100644 index 00000000..0b0d2409 --- /dev/null +++ b/test/performance/routines/level2/xspmv.cpp @@ -0,0 +1,34 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xspmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xspr.cc b/test/performance/routines/level2/xspr.cc deleted file mode 100644 index 9c1c80a0..00000000 --- a/test/performance/routines/level2/xspr.cc +++ /dev/null @@ -1,34 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xspr.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xspr.cpp b/test/performance/routines/level2/xspr.cpp new file mode 100644 index 00000000..9c1c80a0 --- /dev/null +++ b/test/performance/routines/level2/xspr.cpp @@ -0,0 +1,34 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xspr.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xspr2.cc b/test/performance/routines/level2/xspr2.cc deleted file mode 100644 index 117e9c2f..00000000 --- a/test/performance/routines/level2/xspr2.cc +++ /dev/null @@ -1,34 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xspr2.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xspr2.cpp b/test/performance/routines/level2/xspr2.cpp new file mode 100644 index 00000000..117e9c2f --- /dev/null +++ b/test/performance/routines/level2/xspr2.cpp @@ -0,0 +1,34 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xspr2.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xsymv.cc b/test/performance/routines/level2/xsymv.cc deleted file mode 100644 index 60db1ae9..00000000 --- a/test/performance/routines/level2/xsymv.cc +++ /dev/null @@ -1,34 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xsymv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xsymv.cpp b/test/performance/routines/level2/xsymv.cpp new file mode 100644 index 00000000..60db1ae9 --- /dev/null +++ b/test/performance/routines/level2/xsymv.cpp @@ -0,0 +1,34 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xsymv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xsyr.cc b/test/performance/routines/level2/xsyr.cc deleted file mode 100644 index d9ecd38a..00000000 --- a/test/performance/routines/level2/xsyr.cc +++ /dev/null @@ -1,34 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xsyr.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xsyr.cpp b/test/performance/routines/level2/xsyr.cpp new file mode 100644 index 00000000..d9ecd38a --- /dev/null +++ b/test/performance/routines/level2/xsyr.cpp @@ -0,0 +1,34 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xsyr.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xsyr2.cc b/test/performance/routines/level2/xsyr2.cc deleted file mode 100644 index 24e0a517..00000000 --- a/test/performance/routines/level2/xsyr2.cc +++ /dev/null @@ -1,34 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xsyr2.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xsyr2.cpp b/test/performance/routines/level2/xsyr2.cpp new file mode 100644 index 00000000..24e0a517 --- /dev/null +++ b/test/performance/routines/level2/xsyr2.cpp @@ -0,0 +1,34 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xsyr2.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtbmv.cc b/test/performance/routines/level2/xtbmv.cc deleted file mode 100644 index ed9d26a8..00000000 --- a/test/performance/routines/level2/xtbmv.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xtbmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xtbmv.cpp b/test/performance/routines/level2/xtbmv.cpp new file mode 100644 index 00000000..ed9d26a8 --- /dev/null +++ b/test/performance/routines/level2/xtbmv.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xtbmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtbsv.cc b/test/performance/routines/level2/xtbsv.cc deleted file mode 100644 index f0b80330..00000000 --- a/test/performance/routines/level2/xtbsv.cc +++ /dev/null @@ -1,35 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xtbsv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xtbsv.cpp b/test/performance/routines/level2/xtbsv.cpp new file mode 100644 index 00000000..f0b80330 --- /dev/null +++ b/test/performance/routines/level2/xtbsv.cpp @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xtbsv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtpmv.cc b/test/performance/routines/level2/xtpmv.cc deleted file mode 100644 index c5801205..00000000 --- a/test/performance/routines/level2/xtpmv.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xtpmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xtpmv.cpp b/test/performance/routines/level2/xtpmv.cpp new file mode 100644 index 00000000..c5801205 --- /dev/null +++ b/test/performance/routines/level2/xtpmv.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xtpmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtpsv.cc b/test/performance/routines/level2/xtpsv.cc deleted file mode 100644 index db956c9d..00000000 --- a/test/performance/routines/level2/xtpsv.cc +++ /dev/null @@ -1,35 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xtpsv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xtpsv.cpp b/test/performance/routines/level2/xtpsv.cpp new file mode 100644 index 00000000..db956c9d --- /dev/null +++ b/test/performance/routines/level2/xtpsv.cpp @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xtpsv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtrmv.cc b/test/performance/routines/level2/xtrmv.cc deleted file mode 100644 index 629c773c..00000000 --- a/test/performance/routines/level2/xtrmv.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xtrmv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xtrmv.cpp b/test/performance/routines/level2/xtrmv.cpp new file mode 100644 index 00000000..629c773c --- /dev/null +++ b/test/performance/routines/level2/xtrmv.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xtrmv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtrsv.cc b/test/performance/routines/level2/xtrsv.cc deleted file mode 100644 index d6c2968c..00000000 --- a/test/performance/routines/level2/xtrsv.cc +++ /dev/null @@ -1,35 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level2/xtrsv.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level2/xtrsv.cpp b/test/performance/routines/level2/xtrsv.cpp new file mode 100644 index 00000000..d6c2968c --- /dev/null +++ b/test/performance/routines/level2/xtrsv.cpp @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level2/xtrsv.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xgemm.cc b/test/performance/routines/level3/xgemm.cc deleted file mode 100644 index 3f68096e..00000000 --- a/test/performance/routines/level3/xgemm.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level3/xgemm.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level3/xgemm.cpp b/test/performance/routines/level3/xgemm.cpp new file mode 100644 index 00000000..3f68096e --- /dev/null +++ b/test/performance/routines/level3/xgemm.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level3/xgemm.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xhemm.cc b/test/performance/routines/level3/xhemm.cc deleted file mode 100644 index ff6d0f71..00000000 --- a/test/performance/routines/level3/xhemm.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level3/xhemm.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level3/xhemm.cpp b/test/performance/routines/level3/xhemm.cpp new file mode 100644 index 00000000..ff6d0f71 --- /dev/null +++ b/test/performance/routines/level3/xhemm.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level3/xhemm.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xher2k.cc b/test/performance/routines/level3/xher2k.cc deleted file mode 100644 index 9636959e..00000000 --- a/test/performance/routines/level3/xher2k.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level3/xher2k.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level3/xher2k.cpp b/test/performance/routines/level3/xher2k.cpp new file mode 100644 index 00000000..9636959e --- /dev/null +++ b/test/performance/routines/level3/xher2k.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level3/xher2k.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xherk.cc b/test/performance/routines/level3/xherk.cc deleted file mode 100644 index d51400f0..00000000 --- a/test/performance/routines/level3/xherk.cc +++ /dev/null @@ -1,33 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level3/xherk.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level3/xherk.cpp b/test/performance/routines/level3/xherk.cpp new file mode 100644 index 00000000..d51400f0 --- /dev/null +++ b/test/performance/routines/level3/xherk.cpp @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level3/xherk.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xsymm.cc b/test/performance/routines/level3/xsymm.cc deleted file mode 100644 index 38c3dc9b..00000000 --- a/test/performance/routines/level3/xsymm.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level3/xsymm.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level3/xsymm.cpp b/test/performance/routines/level3/xsymm.cpp new file mode 100644 index 00000000..38c3dc9b --- /dev/null +++ b/test/performance/routines/level3/xsymm.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level3/xsymm.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xsyr2k.cc b/test/performance/routines/level3/xsyr2k.cc deleted file mode 100644 index 5360e297..00000000 --- a/test/performance/routines/level3/xsyr2k.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level3/xsyr2k.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level3/xsyr2k.cpp b/test/performance/routines/level3/xsyr2k.cpp new file mode 100644 index 00000000..5360e297 --- /dev/null +++ b/test/performance/routines/level3/xsyr2k.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level3/xsyr2k.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xsyrk.cc b/test/performance/routines/level3/xsyrk.cc deleted file mode 100644 index 30612f99..00000000 --- a/test/performance/routines/level3/xsyrk.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level3/xsyrk.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level3/xsyrk.cpp b/test/performance/routines/level3/xsyrk.cpp new file mode 100644 index 00000000..30612f99 --- /dev/null +++ b/test/performance/routines/level3/xsyrk.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level3/xsyrk.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xtrmm.cc b/test/performance/routines/level3/xtrmm.cc deleted file mode 100644 index 264a34e7..00000000 --- a/test/performance/routines/level3/xtrmm.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level3/xtrmm.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level3/xtrmm.cpp b/test/performance/routines/level3/xtrmm.cpp new file mode 100644 index 00000000..264a34e7 --- /dev/null +++ b/test/performance/routines/level3/xtrmm.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level3/xtrmm.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xtrsm.cc b/test/performance/routines/level3/xtrsm.cc deleted file mode 100644 index 80c46d91..00000000 --- a/test/performance/routines/level3/xtrsm.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/level3/xtrsm.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/level3/xtrsm.cpp b/test/performance/routines/level3/xtrsm.cpp new file mode 100644 index 00000000..80c46d91 --- /dev/null +++ b/test/performance/routines/level3/xtrsm.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/level3/xtrsm.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/levelx/xomatcopy.cc b/test/performance/routines/levelx/xomatcopy.cc deleted file mode 100644 index 0bd5773e..00000000 --- a/test/performance/routines/levelx/xomatcopy.cc +++ /dev/null @@ -1,36 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= - -#include "test/performance/client.hpp" -#include "test/routines/levelx/xomatcopy.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; - case clblast::Precision::kSingle: - clblast::RunClient, float, float>(argc, argv); break; - case clblast::Precision::kDouble: - clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - clblast::RunClient, float2, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: - clblast::RunClient, double2, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/test/performance/routines/levelx/xomatcopy.cpp b/test/performance/routines/levelx/xomatcopy.cpp new file mode 100644 index 00000000..0bd5773e --- /dev/null +++ b/test/performance/routines/levelx/xomatcopy.cpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/levelx/xomatcopy.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= -- cgit v1.2.3