From fa84ac36f23c1aebb5facf946b21d8c0f1a4a46d Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 28 Jul 2018 16:01:03 +0200 Subject: The tuners now also check for valid local thread configurations and skip invalid ones completely, saving compilation time --- CHANGELOG | 1 + 1 file changed, 1 insertion(+) (limited to 'CHANGELOG') diff --git a/CHANGELOG b/CHANGELOG index c1c639e1..f6d05df3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ Development (next version) - Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah') +- The tuners now check beforehand on invalid local thread sizes and skip those completely - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel - Various minor fixes and enhancements -- cgit v1.2.3 From 503ab74f020fe764fd2bd69d60ecd72f758b11a2 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 31 Jul 2018 21:49:37 +0200 Subject: Fixed issue with not performing complex conjugation under certain cases when transposing --- CHANGELOG | 1 + src/routines/common.hpp | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'CHANGELOG') diff --git a/CHANGELOG b/CHANGELOG index f6d05df3..3134e7bf 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ Development (next version) - Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah') - The tuners now check beforehand on invalid local thread sizes and skip those completely +- Fixed an issue with conjugate transpose not being executed in certain cases for a.o. 
XOMATCOPY - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel - Various minor fixes and enhancements diff --git a/src/routines/common.hpp b/src/routines/common.hpp index c30a2e0e..c6db0152 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -76,6 +76,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, // Determines the right kernel auto kernel_name = std::string{}; + auto pad_kernel = false; if (do_transpose) { if (use_fast_kernel && IsMultiple(src_ld, db["TRA_WPT"]) && @@ -85,7 +86,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, } else { use_fast_kernel = false; - kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix"; + pad_kernel = (do_pad || do_conjugate); + kernel_name = (pad_kernel) ? "TransposePadMatrix" : "TransposeMatrix"; } } else { @@ -97,7 +99,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, } else { use_fast_kernel = false; - kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix"; + pad_kernel = do_pad; + kernel_name = (pad_kernel) ? 
"CopyPadMatrix" : "CopyMatrix"; } } @@ -123,7 +126,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, kernel.SetArgument(8, static_cast(dest_offset)); kernel.SetArgument(9, dest()); kernel.SetArgument(10, GetRealArg(alpha)); - if (do_pad) { + if (pad_kernel) { kernel.SetArgument(11, static_cast(do_conjugate)); } else { -- cgit v1.2.3 From fe639455bd1e02c22c459f9e29654f82652e0a97 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 5 Aug 2018 21:12:39 +0200 Subject: Added an option to compile the Netlib API with static OpenCL device and context --- CHANGELOG | 1 + CMakeLists.txt | 12 +- scripts/generator/generator.py | 2 +- scripts/generator/generator/cpp.py | 4 +- src/clblast_netlib_c.cpp | 672 +++++++++++++++++++------------------ 5 files changed, 355 insertions(+), 336 deletions(-) (limited to 'CHANGELOG') diff --git a/CHANGELOG b/CHANGELOG index 3134e7bf..64889ee6 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ Development (next version) - Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah') +- Added an option to compile the Netlib API with static OpenCL device and context (-DNETLIB_STATIC_OPENCL=ON) - The tuners now check beforehand on invalid local thread sizes and skip those completely - Fixed an issue with conjugate transpose not being executed in certain cases for a.o. 
XOMATCOPY - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel diff --git a/CMakeLists.txt b/CMakeLists.txt index fb62ae27..806b4b26 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,9 +32,19 @@ option(SAMPLES "Enable compilation of the examples" OFF) option(TUNERS "Enable compilation of the tuners" ON) option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF) option(TESTS "Enable compilation of the correctness tests" OFF) -option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF) option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF) +# The optional Netlib API for CLBlast +option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF) +option(NETLIB_STATIC_OPENCL "Makes OpenCL device and context in the CBLAS Netlib API static" OFF) +if(NETLIB) + message("-- Building the Netlib API of CLBlast") + if(NETLIB_STATIC_OPENCL) + message(" ^^ while using static variables for OpenCL device and context") + add_definitions(-DNETLIB_STATIC_OPENCL) + endif() +endif() + # Workarounds for bugs option(AMD_SI_EMPTY_KERNEL_WORKAROUND "Enables workaround for bug in AMD Southern Island GPUs" OFF) if(AMD_SI_EMPTY_KERNEL_WORKAROUND) diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 1db789d2..25a04273 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -49,7 +49,7 @@ FILES = [ "/src/clblast_cuda.cpp", "/src/pyclblast/src/pyclblast.pyx" ] -HEADER_LINES = [123, 21, 127, 24, 29, 45, 29, 65, 32, 95, 21, 290] +HEADER_LINES = [123, 21, 127, 24, 29, 45, 29, 65, 40, 95, 21, 290] FOOTER_LINES = [98, 57, 112, 275, 6, 6, 6, 9, 2, 41, 56, 37] HEADER_LINES_DOC = 0 FOOTER_LINES_DOC = 232 diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 51ca047c..6dc3fc93 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -145,8 +145,8 @@ def clblast_netlib_c_cc(routine): result += 
routine.routine_header_netlib(flavour, 9, "") + " {" + NL # Initialize OpenCL - result += " auto device = get_device();" + NL - result += " auto context = clblast::Context(device);" + NL + result += " OPTIONAL_STATIC auto device = get_device();" + NL + result += " OPTIONAL_STATIC auto context = clblast::Context(device);" + NL result += " auto queue = clblast::Queue(context, device);" + NL # Set alpha and beta diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp index 9ab663be..aa1f4006 100644 --- a/src/clblast_netlib_c.cpp +++ b/src/clblast_netlib_c.cpp @@ -23,6 +23,14 @@ using float2 = clblast::float2; using double2 = clblast::double2; +// Option to make OpenCL device and context static to avoid re-creation upon multiple calls to the +// Netlib API. Disadvantage is that they are not cleaned-up until program termination. +#ifdef NETLIB_STATIC_OPENCL + #define OPTIONAL_STATIC static +#else + #define OPTIONAL_STATIC +#endif + // Helper function to get a default OpenCL platform and device clblast::Device get_device() { auto platform_id = clblast::ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}); @@ -40,8 +48,8 @@ void cblas_srotg(float* sa, float* sb, float* sc, float* ss) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto sa_size = 1; const auto sb_size = 1; @@ -73,8 +81,8 @@ void cblas_drotg(double* sa, double* sb, double* sc, double* ss) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto sa_size = 1; const auto sb_size = 1; @@ -109,8 +117,8 @@ void cblas_srotmg(float* sd1, float* sx1, const float sy1, float* sparam) { - auto device = get_device(); - auto context = 
clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto sy1_size = 1; const auto sd1_size = 1; @@ -148,8 +156,8 @@ void cblas_drotmg(double* sd1, double* sx1, const double sy1, double* sparam) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto sy1_size = 1; const auto sd1_size = 1; @@ -189,8 +197,8 @@ void cblas_srot(const int n, float* y, const int y_inc, const float cos, const float sin) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -216,8 +224,8 @@ void cblas_drot(const int n, double* y, const int y_inc, const double cos, const double sin) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -244,8 +252,8 @@ void cblas_srotm(const int n, float* x, const int x_inc, float* y, const int y_inc, float* sparam) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -273,8 +281,8 @@ void cblas_drotm(const int n, double* x, const int x_inc, double* y, const int y_inc, double* sparam) { - auto device = get_device(); - auto context = 
clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -303,8 +311,8 @@ void cblas_drotm(const int n, void cblas_sswap(const int n, float* x, const int x_inc, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -326,8 +334,8 @@ void cblas_sswap(const int n, void cblas_dswap(const int n, double* x, const int x_inc, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -349,8 +357,8 @@ void cblas_dswap(const int n, void cblas_cswap(const int n, void* x, const int x_inc, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -372,8 +380,8 @@ void cblas_cswap(const int n, void cblas_zswap(const int n, void* x, const int x_inc, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -397,8 +405,8 @@ void cblas_zswap(const int n, void cblas_sscal(const 
int n, const float alpha, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -417,8 +425,8 @@ void cblas_sscal(const int n, void cblas_dscal(const int n, const double alpha, double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -437,8 +445,8 @@ void cblas_dscal(const int n, void cblas_cscal(const int n, const void* alpha, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; @@ -457,8 +465,8 @@ void cblas_cscal(const int n, void cblas_zscal(const int n, const void* alpha, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; @@ -479,8 +487,8 @@ void cblas_zscal(const int n, void cblas_scopy(const int n, const float* x, const int x_inc, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = 
clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -501,8 +509,8 @@ void cblas_scopy(const int n, void cblas_dcopy(const int n, const double* x, const int x_inc, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -523,8 +531,8 @@ void cblas_dcopy(const int n, void cblas_ccopy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -545,8 +553,8 @@ void cblas_ccopy(const int n, void cblas_zcopy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -570,8 +578,8 @@ void cblas_saxpy(const int n, const float alpha, const float* x, const int x_inc, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -595,8 +603,8 @@ void cblas_daxpy(const int n, const double alpha, const double* x, const int x_inc, double* y, const int y_inc) { - auto device = get_device(); - 
auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -620,8 +628,8 @@ void cblas_caxpy(const int n, const void* alpha, const void* x, const int x_inc, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; @@ -645,8 +653,8 @@ void cblas_zaxpy(const int n, const void* alpha, const void* x, const int x_inc, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; @@ -671,8 +679,8 @@ void cblas_zaxpy(const int n, float cblas_sdot(const int n, const float* x, const int x_inc, const float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -698,8 +706,8 @@ float cblas_sdot(const int n, double cblas_ddot(const int n, const double* x, const int x_inc, const double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, 
device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -728,8 +736,8 @@ void cblas_cdotu_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -754,8 +762,8 @@ void cblas_zdotu_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -782,8 +790,8 @@ void cblas_cdotc_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -808,8 +816,8 @@ void cblas_zdotc_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -834,8 +842,8 @@ void cblas_zdotc_sub(const int n, // NRM2 float cblas_snrm2(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + 
OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto nrm2_size = 1; @@ -856,8 +864,8 @@ float cblas_snrm2(const int n, } double cblas_dnrm2(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto nrm2_size = 1; @@ -878,8 +886,8 @@ double cblas_dnrm2(const int n, } float cblas_scnrm2(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto nrm2_size = 1; @@ -900,8 +908,8 @@ float cblas_scnrm2(const int n, } double cblas_dznrm2(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto nrm2_size = 1; @@ -924,8 +932,8 @@ double cblas_dznrm2(const int n, // ASUM float cblas_sasum(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto asum_size = 1; @@ -946,8 +954,8 @@ float cblas_sasum(const int n, } double cblas_dasum(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = 
get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto asum_size = 1; @@ -968,8 +976,8 @@ double cblas_dasum(const int n, } float cblas_scasum(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto asum_size = 1; @@ -990,8 +998,8 @@ float cblas_scasum(const int n, } double cblas_dzasum(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto asum_size = 1; @@ -1014,8 +1022,8 @@ double cblas_dzasum(const int n, // SUM float cblas_ssum(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto sum_size = 1; @@ -1036,8 +1044,8 @@ float cblas_ssum(const int n, } double cblas_dsum(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto sum_size = 1; @@ -1058,8 +1066,8 @@ double cblas_dsum(const int n, } float cblas_scsum(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC 
auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto sum_size = 1; @@ -1080,8 +1088,8 @@ float cblas_scsum(const int n, } double cblas_dzsum(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto sum_size = 1; @@ -1104,8 +1112,8 @@ double cblas_dzsum(const int n, // AMAX int cblas_isamax(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1126,8 +1134,8 @@ int cblas_isamax(const int n, } int cblas_idamax(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1148,8 +1156,8 @@ int cblas_idamax(const int n, } int cblas_icamax(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1170,8 +1178,8 @@ int cblas_icamax(const int n, } int cblas_izamax(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + 
OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1194,8 +1202,8 @@ int cblas_izamax(const int n, // AMIN int cblas_isamin(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1216,8 +1224,8 @@ int cblas_isamin(const int n, } int cblas_idamin(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1238,8 +1246,8 @@ int cblas_idamin(const int n, } int cblas_icamin(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1260,8 +1268,8 @@ int cblas_icamin(const int n, } int cblas_izamin(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1284,8 +1292,8 @@ int cblas_izamin(const int n, // MAX int cblas_ismax(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = 
clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1306,8 +1314,8 @@ int cblas_ismax(const int n, } int cblas_idmax(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1328,8 +1336,8 @@ int cblas_idmax(const int n, } int cblas_icmax(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1350,8 +1358,8 @@ int cblas_icmax(const int n, } int cblas_izmax(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1374,8 +1382,8 @@ int cblas_izmax(const int n, // MIN int cblas_ismin(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1396,8 +1404,8 @@ int cblas_ismin(const int n, } int cblas_idmin(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = 
clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1418,8 +1426,8 @@ int cblas_idmin(const int n, } int cblas_icmin(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1440,8 +1448,8 @@ int cblas_icmin(const int n, } int cblas_izmin(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1473,8 +1481,8 @@ void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const float* x, const int x_inc, const float beta, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -1509,8 +1517,8 @@ void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const double* x, const int x_inc, const double beta, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -1545,8 +1553,8 @@ void 
cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -1581,8 +1589,8 @@ void cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -1619,8 +1627,8 @@ void cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const float* x, const int x_inc, const float beta, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -1655,8 +1663,8 @@ void cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const double* x, const int x_inc, const double beta, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, 
device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -1691,8 +1699,8 @@ void cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -1727,8 +1735,8 @@ void cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -1765,8 +1773,8 @@ void cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -1801,8 +1809,8 @@ void cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); 
- auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -1839,8 +1847,8 @@ void cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -1875,8 +1883,8 @@ void cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -1913,8 +1921,8 @@ void cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], 
reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -1949,8 +1957,8 @@ void cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -1987,8 +1995,8 @@ void cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, const float beta, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -2023,8 +2031,8 @@ void cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, const double beta, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -2061,8 +2069,8 @@ void cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, const float beta, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = 
clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -2097,8 +2105,8 @@ void cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, const double beta, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -2135,8 +2143,8 @@ void cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, const float beta, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -2171,8 +2179,8 @@ void cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, const double beta, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -2206,8 +2214,8 @@ void cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const float* a, const int a_ld, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2233,8 +2241,8 @@ void cblas_dtrmv(const 
CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const double* a, const int a_ld, double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2260,8 +2268,8 @@ void cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2287,8 +2295,8 @@ void cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2316,8 +2324,8 @@ void cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2343,8 +2351,8 @@ void cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const double* a, const int a_ld, double* x, const int 
x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2370,8 +2378,8 @@ void cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2397,8 +2405,8 @@ void cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2426,8 +2434,8 @@ void cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const float* ap, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2453,8 +2461,8 @@ void cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const double* ap, double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC 
auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2480,8 +2488,8 @@ void cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* ap, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2507,8 +2515,8 @@ void cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* ap, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2536,8 +2544,8 @@ void cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const float* a, const int a_ld, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2563,8 +2571,8 @@ void cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const double* a, const int a_ld, double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2590,8 
+2598,8 @@ void cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2617,8 +2625,8 @@ void cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2646,8 +2654,8 @@ void cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2673,8 +2681,8 @@ void cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2700,8 +2708,8 @@ void cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const 
void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2727,8 +2735,8 @@ void cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2756,8 +2764,8 @@ void cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const float* ap, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2783,8 +2791,8 @@ void cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const double* ap, double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2810,8 +2818,8 @@ void cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* ap, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = 
get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2837,8 +2845,8 @@ void cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* ap, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2868,8 +2876,8 @@ void cblas_sger(const CLBlastLayout layout, const float* x, const int x_inc, const float* y, const int y_inc, float* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = m * x_inc; @@ -2900,8 +2908,8 @@ void cblas_dger(const CLBlastLayout layout, const double* x, const int x_inc, const double* y, const int y_inc, double* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = m * x_inc; @@ -2934,8 +2942,8 @@ void cblas_cgeru(const CLBlastLayout layout, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], 
reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; @@ -2966,8 +2974,8 @@ void cblas_zgeru(const CLBlastLayout layout, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; @@ -3000,8 +3008,8 @@ void cblas_cgerc(const CLBlastLayout layout, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; @@ -3032,8 +3040,8 @@ void cblas_zgerc(const CLBlastLayout layout, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; @@ -3065,8 +3073,8 @@ void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const float alpha, const void* x, const int x_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const 
auto x_size = n * x_inc; @@ -3093,8 +3101,8 @@ void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const double alpha, const void* x, const int x_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3123,8 +3131,8 @@ void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const float alpha, const void* x, const int x_inc, void* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3151,8 +3159,8 @@ void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const double alpha, const void* x, const int x_inc, void* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3182,8 +3190,8 @@ void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; @@ -3215,8 +3223,8 @@ void cblas_zher2(const CLBlastLayout layout, const 
CLBlastTriangle triangle, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; @@ -3250,8 +3258,8 @@ void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* y, const int y_inc, void* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; @@ -3283,8 +3291,8 @@ void cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* y, const int y_inc, void* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; @@ -3317,8 +3325,8 @@ void cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const float alpha, const float* x, const int x_inc, float* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3345,8 +3353,8 @@ void 
cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const double alpha, const double* x, const int x_inc, double* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3375,8 +3383,8 @@ void cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const float alpha, const float* x, const int x_inc, float* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3403,8 +3411,8 @@ void cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const double alpha, const double* x, const int x_inc, double* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3434,8 +3442,8 @@ void cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, const float* y, const int y_inc, float* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3467,8 +3475,8 @@ void cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, const double* y, const int y_inc, double* a, const 
int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3502,8 +3510,8 @@ void cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, const float* y, const int y_inc, float* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3535,8 +3543,8 @@ void cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, const double* y, const int y_inc, double* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3575,8 +3583,8 @@ void cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const float* b, const int b_ld, const float beta, float* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -3612,8 +3620,8 @@ void cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const double* b, const int b_ld, const double beta, double* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + 
OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -3649,8 +3657,8 @@ void cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -3686,8 +3694,8 @@ void cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -3725,8 +3733,8 @@ void cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const float* b, const int b_ld, const float beta, float* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -3762,8 +3770,8 @@ void cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const double* b, const int b_ld, const double beta, double* c, const int c_ld) { - auto device = get_device(); - auto 
context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -3799,8 +3807,8 @@ void cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -3836,8 +3844,8 @@ void cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -3875,8 +3883,8 @@ void cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -3912,8 +3920,8 @@ void 
cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -3950,8 +3958,8 @@ void cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* a, const int a_ld, const float beta, float* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -3982,8 +3990,8 @@ void cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* a, const int a_ld, const double beta, double* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -4014,8 +4022,8 @@ void cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = 
float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -4046,8 +4054,8 @@ void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -4080,8 +4088,8 @@ void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, const float beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -4112,8 +4120,8 @@ void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, const double beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -4147,8 +4155,8 @@ void cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const float* b, const int b_ld, const float beta, float* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; 
const auto beta_cpp = beta; @@ -4184,8 +4192,8 @@ void cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const double* b, const int b_ld, const double beta, double* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -4221,8 +4229,8 @@ void cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -4258,8 +4266,8 @@ void cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -4297,8 +4305,8 @@ void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* b, const int b_ld, const float beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = 
clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; @@ -4334,8 +4342,8 @@ void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* b, const int b_ld, const double beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; @@ -4371,8 +4379,8 @@ void cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const float alpha, const float* a, const int a_ld, float* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; @@ -4402,8 +4410,8 @@ void cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const double alpha, const double* a, const int a_ld, double* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? 
m * a_ld : n * a_ld; @@ -4433,8 +4441,8 @@ void cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; @@ -4464,8 +4472,8 @@ void cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; @@ -4497,8 +4505,8 @@ void cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const float alpha, const float* a, const int a_ld, float* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? 
m * a_ld : n * a_ld; @@ -4528,8 +4536,8 @@ void cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const double alpha, const double* a, const int a_ld, double* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; @@ -4559,8 +4567,8 @@ void cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; @@ -4590,8 +4598,8 @@ void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? 
m * a_ld : n * a_ld; @@ -4628,8 +4636,8 @@ void cblas_shad(const int n, const float* y, const int y_inc, const float beta, float* z, const int z_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -4661,8 +4669,8 @@ void cblas_dhad(const int n, const double* y, const int y_inc, const double beta, double* z, const int z_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -4694,8 +4702,8 @@ void cblas_chad(const int n, const void* y, const int y_inc, const void* beta, void* z, const int z_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -4727,8 +4735,8 @@ void cblas_zhad(const int n, const void* y, const int y_inc, const void* beta, void* z, const int z_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; @@ -4761,8 +4769,8 @@ void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose 
a_transp const float alpha, const float* a, const int a_ld, float* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; @@ -4789,8 +4797,8 @@ void cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const double alpha, const double* a, const int a_ld, double* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; @@ -4817,8 +4825,8 @@ void cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; @@ -4845,8 +4853,8 @@ void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; @@ -4873,8 +4881,8 @@ void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp void cblas_sim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const float* im, float* col) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto im_size = height * width * channels; const auto col_size = height * width * channels; @@ -4895,8 +4903,8 @@ void cblas_sim2col(const int channels, const int height, const int width, const void cblas_dim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const double* im, double* col) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto im_size = height * width * channels; const auto col_size = height * width * channels; @@ -4917,8 +4925,8 @@ void 
cblas_dim2col(const int channels, const int height, const int width, const void cblas_cim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const void* im, void* col) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto im_size = height * width * channels; const auto col_size = height * width * channels; @@ -4939,8 +4947,8 @@ void cblas_cim2col(const int channels, const int height, const int width, const void cblas_zim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const void* im, void* col) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto im_size = height * width * channels; const auto col_size = height * width * channels; -- cgit v1.2.3 From 9d9f09fce9d42247701c16d89ca356cdd3a76b4b Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 7 Aug 2018 22:41:06 +0200 Subject: Name change of setting to NETLIB_PERSISTENT_OPENCL --- CHANGELOG | 2 +- CMakeLists.txt | 6 +++--- src/clblast_netlib_c.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'CHANGELOG') diff --git a/CHANGELOG b/CHANGELOG index 64889ee6..f2960fde 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,7 +1,7 @@ Development (next version) - Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah') -- Added an option to compile the Netlib API with static OpenCL device and 
context (-DNETLIB_STATIC_OPENCL=ON) +- Added an option to compile the Netlib API with static OpenCL device and context (-DNETLIB_PERSISTENT_OPENCL=ON) - The tuners now check beforehand on invalid local thread sizes and skip those completely - Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel diff --git a/CMakeLists.txt b/CMakeLists.txt index 806b4b26..fd201021 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,12 +36,12 @@ option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF # The optional Netlib API for CLBlast option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF) -option(NETLIB_STATIC_OPENCL "Makes OpenCL device and context in the CBLAS Netlib API static" OFF) +option(NETLIB_PERSISTENT_OPENCL "Makes OpenCL device and context in the CBLAS Netlib API static" OFF) if(NETLIB) message("-- Building the Netlib API of CLBlast") - if(NETLIB_STATIC_OPENCL) + if(NETLIB_PERSISTENT_OPENCL) message(" ^^ while using static variables for OpenCL device and context") - add_definitions(-DNETLIB_STATIC_OPENCL) + add_definitions(-DNETLIB_PERSISTENT_OPENCL) endif() endif() diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp index aa1f4006..dbc2ba57 100644 --- a/src/clblast_netlib_c.cpp +++ b/src/clblast_netlib_c.cpp @@ -25,7 +25,7 @@ using double2 = clblast::double2; // Option to make OpenCL device and context static to avoid re-creation upon multiple calls to the // Netlib API. Disadvantage is that they are not cleaned-up until program termination. 
-#ifdef NETLIB_STATIC_OPENCL +#ifdef NETLIB_PERSISTENT_OPENCL #define OPTIONAL_STATIC static #else #define OPTIONAL_STATIC -- cgit v1.2.3 From 8ac39fa3310ba4a66992ccfce839195c31acf688 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 15 Sep 2018 16:53:09 +0200 Subject: Disabled Intel subgroup shuffling for double-precision --- CHANGELOG | 1 + src/utilities/compile.cpp | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'CHANGELOG') diff --git a/CHANGELOG b/CHANGELOG index f2960fde..27860c85 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -5,6 +5,7 @@ Development (next version) - The tuners now check beforehand on invalid local thread sizes and skip those completely - Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel +- Fixed an issue with the preprocessor and the new GEMMK == 1 kernel - Various minor fixes and enhancements Version 1.4.1 diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp index 835f54b4..00cb90cb 100644 --- a/src/utilities/compile.cpp +++ b/src/utilities/compile.cpp @@ -59,7 +59,8 @@ std::shared_ptr<Program> CompileFromSource( } // For Intel GPUs with subgroup support, use subgroup shuffling. - if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) { + if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups) && + (precision == Precision::kSingle || precision == Precision::kHalf)) { header_string += "#define USE_SUBGROUP_SHUFFLING 1\n"; header_string += "#define SUBGROUP_SHUFFLING_INTEL 1\n"; } -- cgit v1.2.3