summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2018-09-16 20:01:18 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2018-09-16 20:01:18 +0200
commit83ba3d4b7ba3a9cb5fbd2c1ad2bb14b2addd39fb (patch)
tree58900a63158d08e76342b46372fcc59015b4d3ca
parentb7d833901213d03fe5e7f10c15741f55c6c1eb54 (diff)
parentc163868e1822a97750b4380f0d9cdd38369f9f0b (diff)
Merge branch 'master' into convgemm_multi_kernel
-rw-r--r--CHANGELOG4
-rw-r--r--CMakeLists.txt18
-rw-r--r--README.md4
-rw-r--r--doc/api.md4
-rw-r--r--doc/bindings.md6
-rw-r--r--doc/tuning.md20
-rwxr-xr-xscripts/generator/generator.py2
-rw-r--r--scripts/generator/generator/cpp.py4
-rw-r--r--src/clblast_netlib_c.cpp672
-rw-r--r--src/clpp11.hpp37
-rw-r--r--src/database/kernels/copy/copy_3232.hpp1
-rw-r--r--src/database/kernels/gemm_routine/gemm_routine_32.hpp3
-rw-r--r--src/database/kernels/gemm_routine/gemm_routine_3232.hpp3
-rw-r--r--src/database/kernels/invert/invert_32.hpp1
-rw-r--r--src/database/kernels/invert/invert_3232.hpp1
-rw-r--r--src/database/kernels/pad/pad_3232.hpp1
-rw-r--r--src/database/kernels/padtranspose/padtranspose_3232.hpp3
-rw-r--r--src/database/kernels/transpose/transpose_3232.hpp1
-rw-r--r--src/database/kernels/trsv_routine/trsv_routine_32.hpp1
-rw-r--r--src/database/kernels/trsv_routine/trsv_routine_3232.hpp1
-rw-r--r--src/database/kernels/xaxpy/xaxpy_3232.hpp1
-rw-r--r--src/database/kernels/xdot/xdot_32.hpp2
-rw-r--r--src/database/kernels/xdot/xdot_3232.hpp1
-rw-r--r--src/database/kernels/xgemm/xgemm_32.hpp2
-rw-r--r--src/database/kernels/xgemm/xgemm_3232.hpp3
-rw-r--r--src/database/kernels/xgemm_direct/xgemm_direct_32.hpp2
-rw-r--r--src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp3
-rw-r--r--src/database/kernels/xgemv/xgemv_3232.hpp1
-rw-r--r--src/database/kernels/xgemv_fast/xgemv_fast_32.hpp2
-rw-r--r--src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp1
-rw-r--r--src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp2
-rw-r--r--src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp1
-rw-r--r--src/database/kernels/xger/xger_3232.hpp1
-rw-r--r--src/kernel_preprocessor.cpp2
-rw-r--r--src/kernels/level3/xgemm_part1.opencl6
-rw-r--r--src/kernels/level3/xgemm_part2.opencl6
-rw-r--r--src/kernels/level3/xgemm_part3.opencl73
-rw-r--r--src/kernels/level3/xgemm_part4.opencl4
-rw-r--r--src/routines/common.hpp9
-rw-r--r--src/routines/level2/xtrsv.cpp13
-rw-r--r--src/routines/level2/xtrsv.hpp4
-rw-r--r--src/routines/level3/xgemm.cpp1
-rw-r--r--src/routines/level3/xherk.cpp1
-rw-r--r--src/routines/level3/xsyrk.cpp1
-rw-r--r--src/routines/level3/xtrsm.cpp2
-rw-r--r--src/routines/level3/xtrsm.hpp1
-rw-r--r--src/routines/levelx/xgemmbatched.cpp1
-rw-r--r--src/routines/levelx/xgemmstridedbatched.cpp1
-rw-r--r--src/tuning/configurations.cpp49
-rw-r--r--src/tuning/configurations.hpp17
-rw-r--r--src/tuning/kernels/xgemm.cpp4
-rw-r--r--src/tuning/kernels/xgemm.hpp2
-rw-r--r--src/tuning/tuning.cpp3
-rw-r--r--src/tuning/tuning_api.cpp3
-rw-r--r--src/utilities/compile.cpp3
-rw-r--r--test/correctness/misc/preprocessor.cpp14
-rw-r--r--test/correctness/testblas.cpp2
-rw-r--r--test/routines/levelx/xomatcopy.hpp4
-rw-r--r--test/test_utilities.cpp10
-rw-r--r--test/test_utilities.hpp4
60 files changed, 618 insertions, 431 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 53958d6f..63179e95 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,7 +1,11 @@
Development (next version)
- Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')
+- Added an option to compile the Netlib API with static OpenCL device and context (-DNETLIB_PERSISTENT_OPENCL=ON)
+- The tuners now check beforehand on invalid local thread sizes and skip those completely
+- Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY
- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
+- Fixed an issue with the preprocessor and the new GEMMK == 1 kernel
- Various minor fixes and enhancements
- Added non-BLAS routines:
* SCONVGEMM/DCONVGEMM/HCONVGEMM (convolution as im2col followed by batched GEMM)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ac775b63..0f067efb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,9 +32,25 @@ option(SAMPLES "Enable compilation of the examples" OFF)
option(TUNERS "Enable compilation of the tuners" ON)
option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
option(TESTS "Enable compilation of the correctness tests" OFF)
-option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF)
+# The optional Netlib API for CLBlast
+option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
+option(NETLIB_PERSISTENT_OPENCL "Makes OpenCL device and context in the CBLAS Netlib API static" OFF)
+if(NETLIB)
+ message("-- Building the Netlib API of CLBlast")
+ if(NETLIB_PERSISTENT_OPENCL)
+ message(" ^^ while using static variables for OpenCL device and context")
+ add_definitions(-DNETLIB_PERSISTENT_OPENCL)
+ endif()
+endif()
+
+# Workarounds for bugs
+option(AMD_SI_EMPTY_KERNEL_WORKAROUND "Enables workaround for bug in AMD Southern Island GPUs" OFF)
+if(AMD_SI_EMPTY_KERNEL_WORKAROUND)
+ add_definitions(-DAMD_SI_EMPTY_KERNEL_WORKAROUND)
+endif()
+
# Select between an OpenCL API (default) or a CUDA API (beta)
option(OPENCL "Build CLBlast with an OpenCL API (default)" ON)
option(CUDA "Build CLBlast with a CUDA API (beta)" OFF)
diff --git a/README.md b/README.md
index 28fd42d2..85dc4386 100644
--- a/README.md
+++ b/README.md
@@ -101,6 +101,10 @@ Other known issues:
* The AMD run-time compiler has a bug causing it to get stuck in an infinite loop. This is reported to happen occasionally when tuning the CLBlast GEMM routine.
+* AMD Southern Island GPUs might cause wrong results with the amdgpu-pro drivers. Do configure CMake with `AMD_SI_EMPTY_KERNEL_WORKAROUND` to resolve the issue, [see issue #301](https://github.com/CNugteren/CLBlast/issues/301).
+
+* Tests might fail on an Intel IvyBridge GPU with the latest Beignet. Please downgrade Beignet to 1.2.1, [see issue #231](https://github.com/CNugteren/CLBlast/issues/231).
+
Contributing
-------------
diff --git a/doc/api.md b/doc/api.md
index 02bca018..15bc0dcd 100644
--- a/doc/api.md
+++ b/doc/api.md
@@ -3512,7 +3512,7 @@ Arguments to FillCache:
RetrieveParameters: Retrieves current tuning parameters (auxiliary function)
-------------
-This function retrieves current tuning parameters for a specific device-precision-kernel combination. This can be used for debugging or inspection.
+This function retrieves current tuning parameters for a specific device-precision-kernel combination. This can be used for debugging or inspection. See [tuning.md](tuning.md) for more details on which kernel names and parameters are valid.
C++ API:
```
@@ -3535,7 +3535,7 @@ Arguments to RetrieveParameters (C++ version):
OverrideParameters: Override tuning parameters (auxiliary function)
-------------
-This function overrides tuning parameters for a specific device-precision-kernel combination. The next time the target routine is called it will be re-compiled and use the new parameters. All further times (until `OverrideParameters` is called again) it will load the kernel from the cache and thus continue to use the new parameters. Note that the first time after calling `OverrideParameters` a performance drop can be observable due to the re-compilation of the kernel.
+This function overrides tuning parameters for a specific device-precision-kernel combination. The next time the target routine is called it will be re-compiled and use the new parameters. All further times (until `OverrideParameters` is called again) it will load the kernel from the cache and thus continue to use the new parameters. Note that the first time after calling `OverrideParameters` a performance drop can be observable due to the re-compilation of the kernel. See [tuning.md](tuning.md) for more details on which kernel names and parameters are valid.
C++ API:
```
diff --git a/doc/bindings.md b/doc/bindings.md
index 3bd3fc7b..85508e68 100644
--- a/doc/bindings.md
+++ b/doc/bindings.md
@@ -30,3 +30,9 @@ Nim: nim-CLBlast (3rd party)
-------------
A 3rd party CLBlast wrapper for the nim language is available [here](https://github.com/numforge/nim-clblast).
+
+
+Julia: CLBlast.jl (3rd party)
+-------------
+
+A 3rd party CLBlast wrapper for [Julia](https://julialang.org/) is available [here](https://github.com/JuliaGPU/CLBlast.jl).
diff --git a/doc/tuning.md b/doc/tuning.md
index 938c3b6a..3117ffad 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -195,6 +195,26 @@ To inspect current behaviour, you can also retrieve the parameters for a specifi
const Precision precision,
std::unordered_map<std::string,size_t> &parameters)
+These two functions require/retrieve the parameters as given in [src/database/kernels](../src/database/kernels), i.e.:
+
+| Kernel name | Parameters |
+| --------------------|-----------------------|
+| Xaxpy | VW, WGS, WPT |
+| Xdot | WGS1, WGS2 |
+| Xgemv | WGS1, WPT1, UNROLL1 |
+| XgemvFast | VW2, WGS2, WPT2 |
+| XgemvFastRot | VW3, WGS3, WPT3 |
+| Xger | WGS1, WGS2, WPT |
+| Xtrsv | TRSV_BLOCK_SIZE |
+| Xgemm | GEMMK, KREG, KWG, KWI, MDIMA, MDIMC, MWG, NDIMB, NDIMC, NWG, SA, SB, STRM, STRN, VWM, VWN |
+| XgemmDirect | KWID, MDIMAD, MDIMCD, NDIMBD, NDIMCD, PADA, PADB, VWMD, VWND, WGD |
+| Copy | COPY_DIMX, COPY_DIMY, COPY_VW, COPY_WPT |
+| Pad | PAD_DIMX, PAD_DIMY, PAD_WPTX, PAD_WPTY |
+| Transpose | TRA_DIM, TRA_PAD, TRA_SHUFFLE, TRA_WPT |
+| Padtranspose | PADTRA_PAD, PADTRA_TILE, PADTRA_WPT |
+| Invert | INTERNAL_BLOCK_SIZE |
+| TrsvRoutine | TRSV_BLOCK_SIZE |
+
Tuning OpenCL compiler options
-------------
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 004459c3..c2637037 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -49,7 +49,7 @@ FILES = [
"/src/clblast_cuda.cpp",
"/src/pyclblast/src/pyclblast.pyx"
]
-HEADER_LINES = [123, 21, 127, 24, 29, 45, 29, 65, 32, 95, 21, 290]
+HEADER_LINES = [123, 21, 127, 24, 29, 45, 29, 65, 40, 95, 21, 290]
FOOTER_LINES = [98, 57, 112, 275, 6, 6, 6, 9, 2, 41, 56, 37]
HEADER_LINES_DOC = 0
FOOTER_LINES_DOC = 232
diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py
index 51ca047c..6dc3fc93 100644
--- a/scripts/generator/generator/cpp.py
+++ b/scripts/generator/generator/cpp.py
@@ -145,8 +145,8 @@ def clblast_netlib_c_cc(routine):
result += routine.routine_header_netlib(flavour, 9, "") + " {" + NL
# Initialize OpenCL
- result += " auto device = get_device();" + NL
- result += " auto context = clblast::Context(device);" + NL
+ result += " OPTIONAL_STATIC auto device = get_device();" + NL
+ result += " OPTIONAL_STATIC auto context = clblast::Context(device);" + NL
result += " auto queue = clblast::Queue(context, device);" + NL
# Set alpha and beta
diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp
index 9ab663be..dbc2ba57 100644
--- a/src/clblast_netlib_c.cpp
+++ b/src/clblast_netlib_c.cpp
@@ -23,6 +23,14 @@
using float2 = clblast::float2;
using double2 = clblast::double2;
+// Option to make OpenCL device and context static to avoid re-creation upon multiple calls to the
+// Netlib API. Disadvantage is that they are not cleaned-up until program termination.
+#ifdef NETLIB_PERSISTENT_OPENCL
+ #define OPTIONAL_STATIC static
+#else
+ #define OPTIONAL_STATIC
+#endif
+
// Helper function to get a default OpenCL platform and device
clblast::Device get_device() {
auto platform_id = clblast::ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0});
@@ -40,8 +48,8 @@ void cblas_srotg(float* sa,
float* sb,
float* sc,
float* ss) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto sa_size = 1;
const auto sb_size = 1;
@@ -73,8 +81,8 @@ void cblas_drotg(double* sa,
double* sb,
double* sc,
double* ss) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto sa_size = 1;
const auto sb_size = 1;
@@ -109,8 +117,8 @@ void cblas_srotmg(float* sd1,
float* sx1,
const float sy1,
float* sparam) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto sy1_size = 1;
const auto sd1_size = 1;
@@ -148,8 +156,8 @@ void cblas_drotmg(double* sd1,
double* sx1,
const double sy1,
double* sparam) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto sy1_size = 1;
const auto sd1_size = 1;
@@ -189,8 +197,8 @@ void cblas_srot(const int n,
float* y, const int y_inc,
const float cos,
const float sin) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -216,8 +224,8 @@ void cblas_drot(const int n,
double* y, const int y_inc,
const double cos,
const double sin) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -244,8 +252,8 @@ void cblas_srotm(const int n,
float* x, const int x_inc,
float* y, const int y_inc,
float* sparam) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -273,8 +281,8 @@ void cblas_drotm(const int n,
double* x, const int x_inc,
double* y, const int y_inc,
double* sparam) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -303,8 +311,8 @@ void cblas_drotm(const int n,
void cblas_sswap(const int n,
float* x, const int x_inc,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -326,8 +334,8 @@ void cblas_sswap(const int n,
void cblas_dswap(const int n,
double* x, const int x_inc,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -349,8 +357,8 @@ void cblas_dswap(const int n,
void cblas_cswap(const int n,
void* x, const int x_inc,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -372,8 +380,8 @@ void cblas_cswap(const int n,
void cblas_zswap(const int n,
void* x, const int x_inc,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -397,8 +405,8 @@ void cblas_zswap(const int n,
void cblas_sscal(const int n,
const float alpha,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -417,8 +425,8 @@ void cblas_sscal(const int n,
void cblas_dscal(const int n,
const double alpha,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -437,8 +445,8 @@ void cblas_dscal(const int n,
void cblas_cscal(const int n,
const void* alpha,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -457,8 +465,8 @@ void cblas_cscal(const int n,
void cblas_zscal(const int n,
const void* alpha,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -479,8 +487,8 @@ void cblas_zscal(const int n,
void cblas_scopy(const int n,
const float* x, const int x_inc,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -501,8 +509,8 @@ void cblas_scopy(const int n,
void cblas_dcopy(const int n,
const double* x, const int x_inc,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -523,8 +531,8 @@ void cblas_dcopy(const int n,
void cblas_ccopy(const int n,
const void* x, const int x_inc,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -545,8 +553,8 @@ void cblas_ccopy(const int n,
void cblas_zcopy(const int n,
const void* x, const int x_inc,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -570,8 +578,8 @@ void cblas_saxpy(const int n,
const float alpha,
const float* x, const int x_inc,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -595,8 +603,8 @@ void cblas_daxpy(const int n,
const double alpha,
const double* x, const int x_inc,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -620,8 +628,8 @@ void cblas_caxpy(const int n,
const void* alpha,
const void* x, const int x_inc,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -645,8 +653,8 @@ void cblas_zaxpy(const int n,
const void* alpha,
const void* x, const int x_inc,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -671,8 +679,8 @@ void cblas_zaxpy(const int n,
float cblas_sdot(const int n,
const float* x, const int x_inc,
const float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -698,8 +706,8 @@ float cblas_sdot(const int n,
double cblas_ddot(const int n,
const double* x, const int x_inc,
const double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -728,8 +736,8 @@ void cblas_cdotu_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -754,8 +762,8 @@ void cblas_zdotu_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -782,8 +790,8 @@ void cblas_cdotc_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -808,8 +816,8 @@ void cblas_zdotc_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -834,8 +842,8 @@ void cblas_zdotc_sub(const int n,
// NRM2
float cblas_snrm2(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto nrm2_size = 1;
@@ -856,8 +864,8 @@ float cblas_snrm2(const int n,
}
double cblas_dnrm2(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto nrm2_size = 1;
@@ -878,8 +886,8 @@ double cblas_dnrm2(const int n,
}
float cblas_scnrm2(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto nrm2_size = 1;
@@ -900,8 +908,8 @@ float cblas_scnrm2(const int n,
}
double cblas_dznrm2(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto nrm2_size = 1;
@@ -924,8 +932,8 @@ double cblas_dznrm2(const int n,
// ASUM
float cblas_sasum(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto asum_size = 1;
@@ -946,8 +954,8 @@ float cblas_sasum(const int n,
}
double cblas_dasum(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto asum_size = 1;
@@ -968,8 +976,8 @@ double cblas_dasum(const int n,
}
float cblas_scasum(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto asum_size = 1;
@@ -990,8 +998,8 @@ float cblas_scasum(const int n,
}
double cblas_dzasum(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto asum_size = 1;
@@ -1014,8 +1022,8 @@ double cblas_dzasum(const int n,
// SUM
float cblas_ssum(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto sum_size = 1;
@@ -1036,8 +1044,8 @@ float cblas_ssum(const int n,
}
double cblas_dsum(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto sum_size = 1;
@@ -1058,8 +1066,8 @@ double cblas_dsum(const int n,
}
float cblas_scsum(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto sum_size = 1;
@@ -1080,8 +1088,8 @@ float cblas_scsum(const int n,
}
double cblas_dzsum(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto sum_size = 1;
@@ -1104,8 +1112,8 @@ double cblas_dzsum(const int n,
// AMAX
int cblas_isamax(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1126,8 +1134,8 @@ int cblas_isamax(const int n,
}
int cblas_idamax(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1148,8 +1156,8 @@ int cblas_idamax(const int n,
}
int cblas_icamax(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1170,8 +1178,8 @@ int cblas_icamax(const int n,
}
int cblas_izamax(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1194,8 +1202,8 @@ int cblas_izamax(const int n,
// AMIN
int cblas_isamin(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1216,8 +1224,8 @@ int cblas_isamin(const int n,
}
int cblas_idamin(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1238,8 +1246,8 @@ int cblas_idamin(const int n,
}
int cblas_icamin(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1260,8 +1268,8 @@ int cblas_icamin(const int n,
}
int cblas_izamin(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1284,8 +1292,8 @@ int cblas_izamin(const int n,
// MAX
int cblas_ismax(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1306,8 +1314,8 @@ int cblas_ismax(const int n,
}
int cblas_idmax(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1328,8 +1336,8 @@ int cblas_idmax(const int n,
}
int cblas_icmax(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1350,8 +1358,8 @@ int cblas_icmax(const int n,
}
int cblas_izmax(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1374,8 +1382,8 @@ int cblas_izmax(const int n,
// MIN
int cblas_ismin(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1396,8 +1404,8 @@ int cblas_ismin(const int n,
}
int cblas_idmin(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1418,8 +1426,8 @@ int cblas_idmin(const int n,
}
int cblas_icmin(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1440,8 +1448,8 @@ int cblas_icmin(const int n,
}
int cblas_izmin(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1473,8 +1481,8 @@ void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -1509,8 +1517,8 @@ void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -1545,8 +1553,8 @@ void cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -1581,8 +1589,8 @@ void cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -1619,8 +1627,8 @@ void cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -1655,8 +1663,8 @@ void cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -1691,8 +1699,8 @@ void cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -1727,8 +1735,8 @@ void cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -1765,8 +1773,8 @@ void cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -1801,8 +1809,8 @@ void cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -1839,8 +1847,8 @@ void cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -1875,8 +1883,8 @@ void cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -1913,8 +1921,8 @@ void cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -1949,8 +1957,8 @@ void cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -1987,8 +1995,8 @@ void cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -2023,8 +2031,8 @@ void cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -2061,8 +2069,8 @@ void cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -2097,8 +2105,8 @@ void cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -2135,8 +2143,8 @@ void cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -2171,8 +2179,8 @@ void cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -2206,8 +2214,8 @@ void cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const float* a, const int a_ld,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2233,8 +2241,8 @@ void cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const double* a, const int a_ld,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2260,8 +2268,8 @@ void cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2287,8 +2295,8 @@ void cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2316,8 +2324,8 @@ void cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const float* a, const int a_ld,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2343,8 +2351,8 @@ void cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const double* a, const int a_ld,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2370,8 +2378,8 @@ void cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2397,8 +2405,8 @@ void cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2426,8 +2434,8 @@ void cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const float* ap,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2453,8 +2461,8 @@ void cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const double* ap,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2480,8 +2488,8 @@ void cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* ap,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2507,8 +2515,8 @@ void cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* ap,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2536,8 +2544,8 @@ void cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const float* a, const int a_ld,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2563,8 +2571,8 @@ void cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const double* a, const int a_ld,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2590,8 +2598,8 @@ void cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2617,8 +2625,8 @@ void cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2646,8 +2654,8 @@ void cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const float* a, const int a_ld,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2673,8 +2681,8 @@ void cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const double* a, const int a_ld,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2700,8 +2708,8 @@ void cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2727,8 +2735,8 @@ void cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2756,8 +2764,8 @@ void cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const float* ap,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2783,8 +2791,8 @@ void cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const double* ap,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2810,8 +2818,8 @@ void cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* ap,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2837,8 +2845,8 @@ void cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* ap,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2868,8 +2876,8 @@ void cblas_sger(const CLBlastLayout layout,
const float* x, const int x_inc,
const float* y, const int y_inc,
float* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = m * x_inc;
@@ -2900,8 +2908,8 @@ void cblas_dger(const CLBlastLayout layout,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = m * x_inc;
@@ -2934,8 +2942,8 @@ void cblas_cgeru(const CLBlastLayout layout,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto x_size = m * x_inc;
@@ -2966,8 +2974,8 @@ void cblas_zgeru(const CLBlastLayout layout,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto x_size = m * x_inc;
@@ -3000,8 +3008,8 @@ void cblas_cgerc(const CLBlastLayout layout,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto x_size = m * x_inc;
@@ -3032,8 +3040,8 @@ void cblas_zgerc(const CLBlastLayout layout,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto x_size = m * x_inc;
@@ -3065,8 +3073,8 @@ void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float alpha,
const void* x, const int x_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3093,8 +3101,8 @@ void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double alpha,
const void* x, const int x_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3123,8 +3131,8 @@ void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float alpha,
const void* x, const int x_inc,
void* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3151,8 +3159,8 @@ void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double alpha,
const void* x, const int x_inc,
void* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3182,8 +3190,8 @@ void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -3215,8 +3223,8 @@ void cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -3250,8 +3258,8 @@ void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -3283,8 +3291,8 @@ void cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -3317,8 +3325,8 @@ void cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float alpha,
const float* x, const int x_inc,
float* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3345,8 +3353,8 @@ void cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double alpha,
const double* x, const int x_inc,
double* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3375,8 +3383,8 @@ void cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float alpha,
const float* x, const int x_inc,
float* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3403,8 +3411,8 @@ void cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double alpha,
const double* x, const int x_inc,
double* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3434,8 +3442,8 @@ void cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float* x, const int x_inc,
const float* y, const int y_inc,
float* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3467,8 +3475,8 @@ void cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3502,8 +3510,8 @@ void cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float* x, const int x_inc,
const float* y, const int y_inc,
float* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3535,8 +3543,8 @@ void cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3575,8 +3583,8 @@ void cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const float* b, const int b_ld,
const float beta,
float* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -3612,8 +3620,8 @@ void cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const double* b, const int b_ld,
const double beta,
double* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -3649,8 +3657,8 @@ void cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -3686,8 +3694,8 @@ void cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -3725,8 +3733,8 @@ void cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const float* b, const int b_ld,
const float beta,
float* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -3762,8 +3770,8 @@ void cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const double* b, const int b_ld,
const double beta,
double* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -3799,8 +3807,8 @@ void cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -3836,8 +3844,8 @@ void cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -3875,8 +3883,8 @@ void cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -3912,8 +3920,8 @@ void cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -3950,8 +3958,8 @@ void cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const float* a, const int a_ld,
const float beta,
float* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -3982,8 +3990,8 @@ void cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const double* a, const int a_ld,
const double beta,
double* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4014,8 +4022,8 @@ void cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const void* a, const int a_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -4046,8 +4054,8 @@ void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const void* a, const int a_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -4080,8 +4088,8 @@ void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const void* a, const int a_ld,
const float beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4112,8 +4120,8 @@ void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const void* a, const int a_ld,
const double beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4147,8 +4155,8 @@ void cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co
const float* b, const int b_ld,
const float beta,
float* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4184,8 +4192,8 @@ void cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co
const double* b, const int b_ld,
const double beta,
double* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4221,8 +4229,8 @@ void cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -4258,8 +4266,8 @@ void cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -4297,8 +4305,8 @@ void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co
const void* b, const int b_ld,
const float beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = beta;
@@ -4334,8 +4342,8 @@ void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co
const void* b, const int b_ld,
const double beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = beta;
@@ -4371,8 +4379,8 @@ void cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const float alpha,
const float* a, const int a_ld,
float* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4402,8 +4410,8 @@ void cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const double alpha,
const double* a, const int a_ld,
double* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4433,8 +4441,8 @@ void cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4464,8 +4472,8 @@ void cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4497,8 +4505,8 @@ void cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const float alpha,
const float* a, const int a_ld,
float* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4528,8 +4536,8 @@ void cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const double alpha,
const double* a, const int a_ld,
double* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4559,8 +4567,8 @@ void cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4590,8 +4598,8 @@ void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4628,8 +4636,8 @@ void cblas_shad(const int n,
const float* y, const int y_inc,
const float beta,
float* z, const int z_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4661,8 +4669,8 @@ void cblas_dhad(const int n,
const double* y, const int y_inc,
const double beta,
double* z, const int z_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4694,8 +4702,8 @@ void cblas_chad(const int n,
const void* y, const int y_inc,
const void* beta,
void* z, const int z_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -4727,8 +4735,8 @@ void cblas_zhad(const int n,
const void* y, const int y_inc,
const void* beta,
void* z, const int z_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -4761,8 +4769,8 @@ void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp
const float alpha,
const float* a, const int a_ld,
float* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
@@ -4789,8 +4797,8 @@ void cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp
const double alpha,
const double* a, const int a_ld,
double* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
@@ -4817,8 +4825,8 @@ void cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
@@ -4845,8 +4853,8 @@ void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
@@ -4873,8 +4881,8 @@ void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp
void cblas_sim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const float* im,
float* col) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto im_size = height * width * channels;
const auto col_size = height * width * channels;
@@ -4895,8 +4903,8 @@ void cblas_sim2col(const int channels, const int height, const int width, const
void cblas_dim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const double* im,
double* col) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto im_size = height * width * channels;
const auto col_size = height * width * channels;
@@ -4917,8 +4925,8 @@ void cblas_dim2col(const int channels, const int height, const int width, const
void cblas_cim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const void* im,
void* col) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto im_size = height * width * channels;
const auto col_size = height * width * channels;
@@ -4939,8 +4947,8 @@ void cblas_cim2col(const int channels, const int height, const int width, const
void cblas_zim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const void* im,
void* col) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto im_size = height * width * channels;
const auto col_size = height * width * channels;
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index 8ac0523f..94464990 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -447,8 +447,14 @@ class Program {
// Source-based constructor with memory management
explicit Program(const Context &context, const std::string &source) {
- const char *source_ptr = &source[0];
- const auto length = source.length();
+ #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+ const std::string source_null_kernel = source + "\n__kernel void null_kernel() {}\n";
+ const char *source_ptr = &source_null_kernel[0];
+ const auto length = source_null_kernel.length();
+ #else
+ const char *source_ptr = &source[0];
+ const auto length = source.length();
+ #endif
auto status = CL_SUCCESS;
program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
CLCudaAPIError::Check(status, "clCreateProgramWithSource");
@@ -723,9 +729,10 @@ class Buffer {
}
// Copies the contents of this buffer into another device buffer
- void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
+ void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination,
+ EventPointer event = nullptr) const {
CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0,
- nullptr, nullptr));
+ nullptr, event));
}
void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
CopyToAsync(queue, size, destination);
@@ -764,10 +771,21 @@ class Kernel {
kernel_(new cl_kernel, [](cl_kernel* k) {
if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
delete k;
- }) {
+ })
+ #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+ , null_kernel_(new cl_kernel, [](cl_kernel* k) {
+ if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
+ delete k;
+ })
+ #endif
+ {
auto status = CL_SUCCESS;
*kernel_ = clCreateKernel(program->operator()(), name.c_str(), &status);
CLCudaAPIError::Check(status, "clCreateKernel");
+ #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+ *null_kernel_ = clCreateKernel(program->operator()(), "null_kernel", &status);
+ CLCudaAPIError::Check(status, "clCreateKernel");
+ #endif
}
// Sets a kernel argument at the indicated position
@@ -831,12 +849,21 @@ class Kernel {
static_cast<cl_uint>(waitForEventsPlain.size()),
!waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr,
event));
+ #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+ const std::vector<size_t> nullRange = {1};
+ CheckError(clEnqueueNDRangeKernel(queue(), *null_kernel_, static_cast<cl_uint>(nullRange.size()),
+ nullptr, nullRange.data(), nullptr,
+ 0, nullptr, nullptr));
+ #endif
}
// Accessor to the private data-member
const cl_kernel& operator()() const { return *kernel_; }
private:
std::shared_ptr<cl_kernel> kernel_;
+ #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+ std::shared_ptr<cl_kernel> null_kernel_;
+ #endif
// Internal implementation for the recursive SetArguments function.
template <typename T>
diff --git a/src/database/kernels/copy/copy_3232.hpp b/src/database/kernels/copy/copy_3232.hpp
index 64d56a7b..83ba8106 100644
--- a/src/database/kernels/copy/copy_3232.hpp
+++ b/src/database/kernels/copy/copy_3232.hpp
@@ -88,6 +88,7 @@ const DatabaseEntry CopyComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 16, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 32, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/gemm_routine/gemm_routine_32.hpp b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
index b685d4bc..ba0cc5a3 100644
--- a/src/database/kernels/gemm_routine/gemm_routine_32.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
@@ -33,6 +33,7 @@ const DatabaseEntry GemmRoutineSingle = {
kDeviceTypeGPU, "Intel", {
{ "default", {
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
@@ -62,7 +63,7 @@ const DatabaseEntry GemmRoutineSingle = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 704, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
index c72db083..9977bb78 100644
--- a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
@@ -24,6 +24,7 @@ const DatabaseEntry GemmRoutineComplexSingle = {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "default", {
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
@@ -49,7 +50,7 @@ const DatabaseEntry GemmRoutineComplexSingle = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/invert/invert_32.hpp b/src/database/kernels/invert/invert_32.hpp
index b3f9143a..d550e3ba 100644
--- a/src/database/kernels/invert/invert_32.hpp
+++ b/src/database/kernels/invert/invert_32.hpp
@@ -24,6 +24,7 @@ const DatabaseEntry InvertSingle = {
kDeviceTypeGPU, "Intel", {
{ "default", {
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/invert/invert_3232.hpp b/src/database/kernels/invert/invert_3232.hpp
index 11ea895d..d1103909 100644
--- a/src/database/kernels/invert/invert_3232.hpp
+++ b/src/database/kernels/invert/invert_3232.hpp
@@ -23,6 +23,7 @@ const DatabaseEntry InvertComplexSingle = {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "default", {
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/pad/pad_3232.hpp b/src/database/kernels/pad/pad_3232.hpp
index 88ae08a3..08ed21a4 100644
--- a/src/database/kernels/pad/pad_3232.hpp
+++ b/src/database/kernels/pad/pad_3232.hpp
@@ -88,6 +88,7 @@ const DatabaseEntry PadComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 32, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 32, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/padtranspose/padtranspose_3232.hpp b/src/database/kernels/padtranspose/padtranspose_3232.hpp
index fb0ec5d0..32506c1e 100644
--- a/src/database/kernels/padtranspose/padtranspose_3232.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_3232.hpp
@@ -88,10 +88,11 @@ const DatabaseEntry PadtransposeComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/transpose/transpose_3232.hpp b/src/database/kernels/transpose/transpose_3232.hpp
index 45b2c3ff..b0de29f1 100644
--- a/src/database/kernels/transpose/transpose_3232.hpp
+++ b/src/database/kernels/transpose/transpose_3232.hpp
@@ -88,6 +88,7 @@ const DatabaseEntry TransposeComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/trsv_routine/trsv_routine_32.hpp b/src/database/kernels/trsv_routine/trsv_routine_32.hpp
index 2ee82b71..7fc71f5e 100644
--- a/src/database/kernels/trsv_routine/trsv_routine_32.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_32.hpp
@@ -24,6 +24,7 @@ const DatabaseEntry TrsvRoutineSingle = {
kDeviceTypeGPU, "Intel", {
{ "default", {
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/trsv_routine/trsv_routine_3232.hpp b/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
index 6f2f9306..221f12cf 100644
--- a/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
@@ -23,6 +23,7 @@ const DatabaseEntry TrsvRoutineComplexSingle = {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "default", {
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xaxpy/xaxpy_3232.hpp b/src/database/kernels/xaxpy/xaxpy_3232.hpp
index 4a29da4d..25cd3630 100644
--- a/src/database/kernels/xaxpy/xaxpy_3232.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_3232.hpp
@@ -88,6 +88,7 @@ const DatabaseEntry XaxpyComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 1, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xdot/xdot_32.hpp b/src/database/kernels/xdot/xdot_32.hpp
index a7e685bf..2b7bc724 100644
--- a/src/database/kernels/xdot/xdot_32.hpp
+++ b/src/database/kernels/xdot/xdot_32.hpp
@@ -84,7 +84,7 @@ const DatabaseEntry XdotSingle = {
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 512, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xdot/xdot_3232.hpp b/src/database/kernels/xdot/xdot_3232.hpp
index ad2cf414..e655e17e 100644
--- a/src/database/kernels/xdot/xdot_3232.hpp
+++ b/src/database/kernels/xdot/xdot_3232.hpp
@@ -82,6 +82,7 @@ const DatabaseEntry XdotComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemm/xgemm_32.hpp b/src/database/kernels/xgemm/xgemm_32.hpp
index 32358dbc..d75758ea 100644
--- a/src/database/kernels/xgemm/xgemm_32.hpp
+++ b/src/database/kernels/xgemm/xgemm_32.hpp
@@ -90,7 +90,7 @@ const DatabaseEntry XgemmSingle = {
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 1, 4, 1, 1, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 4 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 0, 1, 32, 2, 32, 8, 64, 16, 16, 128, 0, 0, 0, 1, 1, 2 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 0, 1, 16, 2, 16, 8, 32, 8, 16, 128, 1, 1, 1, 1, 2, 4 } },
- { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 16, 128, 1, 1, 0, 1, 1, 4 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 2, 1, 1, 4, 4, 32, 8, 8, 64, 0, 0, 0, 0, 2, 2 } },
{ Name{"Iris "}, Params{ 0, 1, 16, 8, 16, 8, 128, 32, 16, 64, 1, 1, 1, 1, 4, 1 } },
{ Name{"Iris Pro "}, Params{ 0, 1, 16, 2, 16, 8, 64, 32, 32, 128, 1, 1, 1, 0, 4, 4 } },
diff --git a/src/database/kernels/xgemm/xgemm_3232.hpp b/src/database/kernels/xgemm/xgemm_3232.hpp
index 22959347..9c0b70b1 100644
--- a/src/database/kernels/xgemm/xgemm_3232.hpp
+++ b/src/database/kernels/xgemm/xgemm_3232.hpp
@@ -88,10 +88,11 @@ const DatabaseEntry XgemmComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 0, 1, 16, 8, 8, 8, 32, 16, 16, 64, 1, 0, 0, 0, 4, 4 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 0, 1, 16, 2, 16, 8, 32, 8, 8, 32, 0, 0, 1, 0, 1, 1 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 0, 1, 32, 8, 16, 16, 64, 16, 16, 64, 1, 1, 1, 1, 2, 1 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 4, 1, 1, 32, 32, 128, 16, 16, 128, 0, 0, 0, 0, 4, 1 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 2, 1, 1, 4, 4, 16, 8, 8, 64, 0, 0, 0, 0, 2, 2 } },
{ Name{"Iris "}, Params{ 0, 1, 32, 8, 32, 16, 64, 8, 16, 64, 1, 0, 1, 0, 1, 1 } },
{ Name{"Iris Pro "}, Params{ 0, 1, 16, 2, 8, 8, 32, 32, 8, 32, 1, 1, 1, 1, 1, 1 } },
- { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 32, 32, 128, 1, 1, 1, 0, 2, 2 } },
+ { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 8, 32, 8, 8, 32, 0, 0, 1, 0, 1, 1 } },
} },
}
},
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
index f6ea9523..146018d5 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
@@ -69,7 +69,7 @@ const DatabaseEntry XgemmDirectSingle = {
{ "default", {
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } },
- { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 8, 16, 8, 16, 16, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 8, 16, 32, 16, 8, 1, 0, 1, 1, 64, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
index 8f24ee7d..6dd95b38 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
@@ -63,9 +63,10 @@ const DatabaseEntry XgemmDirectComplexSingle = {
kDeviceTypeGPU, "Intel", {
{ "default", {
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xgemv/xgemv_3232.hpp b/src/database/kernels/xgemv/xgemv_3232.hpp
index dc9a0a88..2b4328b3 100644
--- a/src/database/kernels/xgemv/xgemv_3232.hpp
+++ b/src/database/kernels/xgemv/xgemv_3232.hpp
@@ -86,6 +86,7 @@ const DatabaseEntry XgemvComplexSingle = {
{ Name{"Intel(R) HD Graphics 530 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
index 146bd466..351973ae 100644
--- a/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
@@ -92,7 +92,7 @@ const DatabaseEntry XgemvFastSingle = {
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
index 693fac4e..e9928b28 100644
--- a/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
@@ -86,6 +86,7 @@ const DatabaseEntry XgemvFastComplexSingle = {
{ Name{"Intel(R) HD Graphics 530 "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
index 42e7a36d..cf1b4e55 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
@@ -70,7 +70,7 @@ const DatabaseEntry XgemvFastRotSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 128, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
index 98d5cf6a..bf780835 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
@@ -68,6 +68,7 @@ const DatabaseEntry XgemvFastRotComplexSingle = {
{ "default", {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 4, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xger/xger_3232.hpp b/src/database/kernels/xger/xger_3232.hpp
index 123fc4fa..763f2ca6 100644
--- a/src/database/kernels/xger/xger_3232.hpp
+++ b/src/database/kernels/xger/xger_3232.hpp
@@ -87,6 +87,7 @@ const DatabaseEntry XgerComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/kernel_preprocessor.cpp b/src/kernel_preprocessor.cpp
index aa946bab..1c422d33 100644
--- a/src/kernel_preprocessor.cpp
+++ b/src/kernel_preprocessor.cpp
@@ -557,6 +557,8 @@ std::string PreprocessKernelSource(const std::string& kernel_source) {
lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers);
lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
+ lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
+ lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, true);
// Gather the results
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 3cfc5dfb..80a60107 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -43,8 +43,6 @@
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
-// =================================================================================================
-
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef GEMMK
@@ -397,9 +395,7 @@ INLINE_FUNC realN LocalToPrivateB(LOCAL_PTR realN* blm, const int _ni, const int
}
#endif
-// =================================================================================================
-
-// End of the C++11 raw string literal
)"
+// End of the C++11 raw string literal
// =================================================================================================
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index 17c8955a..ee4d5da5 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -15,8 +15,6 @@
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
-// =================================================================================================
-
// The vectorised multiply-add function
INLINE_FUNC realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
#if USE_VECTOR_MAD == 1
@@ -171,9 +169,7 @@ INLINE_FUNC void StoreResults(__global realM* cgm, realM c_value, const int _mi,
cgm[index] = result;
}
-// =================================================================================================
-
-// End of the C++11 raw string literal
)"
+// End of the C++11 raw string literal
// =================================================================================================
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index 90de0b3b..77964a94 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -15,14 +15,12 @@
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
-// =================================================================================================
-
// A common interface for subgroup functions
#if USE_SUBGROUP_SHUFFLING == 1
INLINE_FUNC int clblast_get_sub_group_local_id() {
-
+
// Intel extension
#if SUBGROUP_SHUFFLING_INTEL == 1
return get_sub_group_local_id();
@@ -36,7 +34,7 @@ INLINE_FUNC int clblast_get_sub_group_local_id() {
}
INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
-
+
// Intel extension
#if SUBGROUP_SHUFFLING_INTEL == 1
return intel_sub_group_shuffle(reg, src);
@@ -238,48 +236,47 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
for (int _mi = 0; _mi < MWI/VWM; _mi += 1) {
#pragma unroll
for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
- const int index = _ni * (MWI/VWM) + _mi;
#if USE_SUBGROUP_SHUFFLING == 1
const realN aval = clblast_sub_group_shuffle(apm[_ki], _ni);
#else
const realN aval = apm[_ni * (KREG/VWN) + _ki];
#endif
#if VWN == 1
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval);
#elif VWN == 2
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y);
#elif VWN == 4
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.z);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.w);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.z);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.w);
#elif VWN == 8
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.s0);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.s1);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.s2);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.s3);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 4) * (MWI/VWM) + _mi], aval.s4);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 5) * (MWI/VWM) + _mi], aval.s5);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 6) * (MWI/VWM) + _mi], aval.s6);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 7) * (MWI/VWM) + _mi], aval.s7);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.s0);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.s1);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.s2);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.s3);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 4) * (MWI/VWM) + _mi], aval.s4);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 5) * (MWI/VWM) + _mi], aval.s5);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 6) * (MWI/VWM) + _mi], aval.s6);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 7) * (MWI/VWM) + _mi], aval.s7);
#elif VWN == 16
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0 ) * (MWI/VWM) + _mi], aval.s0);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1 ) * (MWI/VWM) + _mi], aval.s1);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 2 ) * (MWI/VWM) + _mi], aval.s2);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 3 ) * (MWI/VWM) + _mi], aval.s3);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 4 ) * (MWI/VWM) + _mi], aval.s4);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 5 ) * (MWI/VWM) + _mi], aval.s5);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 6 ) * (MWI/VWM) + _mi], aval.s6);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 7 ) * (MWI/VWM) + _mi], aval.s7);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 8 ) * (MWI/VWM) + _mi], aval.s8);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 9 ) * (MWI/VWM) + _mi], aval.s9);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 10) * (MWI/VWM) + _mi], aval.sA);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 11) * (MWI/VWM) + _mi], aval.sB);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 12) * (MWI/VWM) + _mi], aval.sC);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 13) * (MWI/VWM) + _mi], aval.sD);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 14) * (MWI/VWM) + _mi], aval.sE);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 15) * (MWI/VWM) + _mi], aval.sF);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0 ) * (MWI/VWM) + _mi], aval.s0);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1 ) * (MWI/VWM) + _mi], aval.s1);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2 ) * (MWI/VWM) + _mi], aval.s2);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3 ) * (MWI/VWM) + _mi], aval.s3);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 4 ) * (MWI/VWM) + _mi], aval.s4);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 5 ) * (MWI/VWM) + _mi], aval.s5);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 6 ) * (MWI/VWM) + _mi], aval.s6);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 7 ) * (MWI/VWM) + _mi], aval.s7);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 8 ) * (MWI/VWM) + _mi], aval.s8);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 9 ) * (MWI/VWM) + _mi], aval.s9);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 10) * (MWI/VWM) + _mi], aval.sA);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 11) * (MWI/VWM) + _mi], aval.sB);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 12) * (MWI/VWM) + _mi], aval.sC);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 13) * (MWI/VWM) + _mi], aval.sD);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 14) * (MWI/VWM) + _mi], aval.sE);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 15) * (MWI/VWM) + _mi], aval.sF);
#endif
}
}
@@ -311,9 +308,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
}
}
-// =================================================================================================
-
-// End of the C++11 raw string literal
)"
+// End of the C++11 raw string literal
// =================================================================================================
diff --git a/src/kernels/level3/xgemm_part4.opencl b/src/kernels/level3/xgemm_part4.opencl
index e581cd84..b1f1ade6 100644
--- a/src/kernels/level3/xgemm_part4.opencl
+++ b/src/kernels/level3/xgemm_part4.opencl
@@ -15,7 +15,6 @@
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
-// =================================================================================================
// The upper-triangular and lower-triangular kernels are only used in special cases
#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
@@ -132,9 +131,8 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
}
#endif
-// =================================================================================================
-// End of the C++11 raw string literal
)"
+// End of the C++11 raw string literal
// =================================================================================================
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index c30a2e0e..c6db0152 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -76,6 +76,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
// Determines the right kernel
auto kernel_name = std::string{};
+ auto pad_kernel = false;
if (do_transpose) {
if (use_fast_kernel &&
IsMultiple(src_ld, db["TRA_WPT"]) &&
@@ -85,7 +86,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
else {
use_fast_kernel = false;
- kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
+ pad_kernel = (do_pad || do_conjugate);
+ kernel_name = (pad_kernel) ? "TransposePadMatrix" : "TransposeMatrix";
}
}
else {
@@ -97,7 +99,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
else {
use_fast_kernel = false;
- kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
+ pad_kernel = do_pad;
+ kernel_name = (pad_kernel) ? "CopyPadMatrix" : "CopyMatrix";
}
}
@@ -123,7 +126,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
kernel.SetArgument(10, GetRealArg(alpha));
- if (do_pad) {
+ if (pad_kernel) {
kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
index 76401753..2a5a5664 100644
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@@ -33,7 +33,8 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ EventPointer event) {
if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); };
@@ -69,9 +70,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
// Launches the kernel
const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]};
const auto global = std::vector<size_t>{Ceil(n, db_["TRSV_BLOCK_SIZE"])};
- auto event = Event();
- RunKernel(kernel, queue_, device_, global, local, event.pointer());
- event.WaitForCompletion();
+ RunKernel(kernel, queue_, device_, global, local, event);
}
// =================================================================================================
@@ -146,14 +145,16 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
}
// Runs the triangular substitution for the block size
+ auto sub_event = Event();
Substitution(layout, triangle, a_transpose, diagonal, block_size,
a_buffer, a_offset + col + col*a_ld, a_ld,
b_buffer, b_offset + col*b_inc, b_inc,
- x_buffer, x_offset + col*x_inc, x_inc);
+ x_buffer, x_offset + col*x_inc, x_inc, sub_event.pointer());
+ sub_event.WaitForCompletion();
}
// Retrieves the results
- x_buffer.CopyTo(queue_, x_size, b_buffer);
+ x_buffer.CopyToAsync(queue_, x_size, b_buffer, event_);
}
// =================================================================================================
diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp
index 67e626a1..8a900a35 100644
--- a/src/routines/level2/xtrsv.hpp
+++ b/src/routines/level2/xtrsv.hpp
@@ -32,6 +32,7 @@ class Xtrsv: public Xgemv<T> {
using Xgemv<T>::device_;
using Xgemv<T>::db_;
using Xgemv<T>::program_;
+ using Xgemv<T>::event_;
using Xgemv<T>::DoGemv;
// Constructor
@@ -50,7 +51,8 @@ class Xtrsv: public Xgemv<T> {
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
- const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc);
+ const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc,
+ EventPointer event);
};
// =================================================================================================
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index fd5a20db..cb24460a 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -40,6 +40,7 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ , // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part3.opencl"
#include "../../kernels/level3/xgemm_part4.opencl"
}) {
diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp
index 6912d3a9..2e6f30ec 100644
--- a/src/routines/level3/xherk.cpp
+++ b/src/routines/level3/xherk.cpp
@@ -32,6 +32,7 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ , // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part3.opencl"
#include "../../kernels/level3/xgemm_part4.opencl"
}) {
diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp
index 6bb2a24f..5ffdc028 100644
--- a/src/routines/level3/xsyrk.cpp
+++ b/src/routines/level3/xsyrk.cpp
@@ -32,6 +32,7 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ , // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part3.opencl"
#include "../../kernels/level3/xgemm_part4.opencl"
}) {
diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp
index 905660ff..fe5d1e14 100644
--- a/src/routines/level3/xtrsm.cpp
+++ b/src/routines/level3/xtrsm.cpp
@@ -246,7 +246,7 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
}
// Retrieves the results
- x_buffer.CopyTo(queue_, b_size, b_buffer);
+ x_buffer.CopyToAsync(queue_, b_size, b_buffer, event_);
}
// =================================================================================================
diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp
index 5b42398e..871d7253 100644
--- a/src/routines/level3/xtrsm.hpp
+++ b/src/routines/level3/xtrsm.hpp
@@ -31,6 +31,7 @@ class Xtrsm: public Xgemm<T> {
using Xgemm<T>::device_;
using Xgemm<T>::db_;
using Xgemm<T>::program_;
+ using Xgemm<T>::event_;
using Xgemm<T>::DoGemm;
// Constructor
diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp
index 2bbc5007..b12b8734 100644
--- a/src/routines/levelx/xgemmbatched.cpp
+++ b/src/routines/levelx/xgemmbatched.cpp
@@ -38,6 +38,7 @@ XgemmBatched<T>::XgemmBatched(Queue &queue, EventPointer event, const std::strin
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ , // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part3.opencl"
#include "../../kernels/level3/xgemm_part4.opencl"
, // separated in multiple parts to prevent C1091 in MSVC 2013
diff --git a/src/routines/levelx/xgemmstridedbatched.cpp b/src/routines/levelx/xgemmstridedbatched.cpp
index 30c161cc..d9e3ebba 100644
--- a/src/routines/levelx/xgemmstridedbatched.cpp
+++ b/src/routines/levelx/xgemmstridedbatched.cpp
@@ -37,6 +37,7 @@ XgemmStridedBatched<T>::XgemmStridedBatched(Queue &queue, EventPointer event, co
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ , // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part3.opencl"
#include "../../kernels/level3/xgemm_part4.opencl"
, // separated in multiple parts to prevent C1091 in MSVC 2013
diff --git a/src/tuning/configurations.cpp b/src/tuning/configurations.cpp
index 1fe232cf..82d7e3b4 100644
--- a/src/tuning/configurations.cpp
+++ b/src/tuning/configurations.cpp
@@ -23,28 +23,42 @@ namespace clblast {
// Finds all configurations. It also applies the user-defined constraints within.
std::vector<Configuration> SetConfigurations(const Device& device,
const std::vector<Parameter> parameters,
+ const std::vector<size_t>& local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
const Constraints& constraints,
const LocalMemSizeInfo& local_mem_size_info) {
const auto local_mem_max = device.LocalMemSize();
+ const auto max_work_item_sizes = device.MaxWorkItemSizes();
+ const auto max_work_group_size = device.MaxWorkGroupSize();
auto config = Configuration();
auto configurations = std::vector<Configuration>();
- PopulateConfigurations(parameters, 0, config, configurations,
- local_mem_max, constraints, local_mem_size_info);
+ PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config,
+ 0, config, configurations,
+ local_mem_max, constraints, local_mem_size_info,
+ max_work_item_sizes, max_work_group_size);
return configurations;
}
// Iterates recursively over all permutations of the user-defined parameters
void PopulateConfigurations(const std::vector<Parameter> &parameters,
+ const std::vector<size_t> local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
const size_t index, const Configuration &config,
std::vector<Configuration> &configuration,
const size_t local_mem_max,
const Constraints& constraints,
- const LocalMemSizeInfo& local_mem_size_info) {
+ const LocalMemSizeInfo& local_mem_size_info,
+ const std::vector<size_t>& max_work_item_sizes,
+ const size_t max_work_group_size) {
// End of the chain: all parameters are considered, store the resulting configuration if it is a
// valid one according to the constraints
if (index == parameters.size()) {
- if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info)) {
+ if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info,
+ local_size_base, mul_local_config, div_local_config,
+ max_work_item_sizes, max_work_group_size)) {
configuration.push_back(config);
}
return;
@@ -55,8 +69,10 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters,
for (auto &value: parameter.second) {
auto config_copy = config;
config_copy[parameter.first] = value;
- PopulateConfigurations(parameters, index+1, config_copy, configuration,
- local_mem_max, constraints, local_mem_size_info);
+ PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config,
+ index+1, config_copy, configuration,
+ local_mem_max, constraints, local_mem_size_info,
+ max_work_item_sizes, max_work_group_size);
}
}
@@ -64,7 +80,12 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters,
bool ValidConfiguration(const Configuration &config,
const size_t local_mem_max,
const Constraints& constraints,
- const LocalMemSizeInfo& local_mem_size_info) {
+ const LocalMemSizeInfo& local_mem_size_info,
+ const std::vector<size_t> local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
+ const std::vector<size_t>& max_work_item_sizes,
+ const size_t max_work_group_size) {
// Iterates over all constraints
for (auto &constraint: constraints) {
@@ -92,6 +113,20 @@ bool ValidConfiguration(const Configuration &config,
return false;
}
+ // Checks the local thread size (both per dimension and in total)
+ const auto local = SetThreadConfiguration(config, local_size_base,
+ mul_local_config, div_local_config);
+ for (auto i=size_t{0}; i<local.size(); ++i) {
+ if (local[i] > max_work_item_sizes[i]) {
+ return false;
+ }
+ }
+ auto local_size = size_t{1};
+ for (auto &item: local) { local_size *= item; }
+ if (local_size > max_work_group_size) {
+ return false;
+ }
+
// Everything was OK: this configuration is valid
return true;
}
diff --git a/src/tuning/configurations.hpp b/src/tuning/configurations.hpp
index faa5498f..4b9ba93f 100644
--- a/src/tuning/configurations.hpp
+++ b/src/tuning/configurations.hpp
@@ -50,6 +50,9 @@ struct LocalMemSizeInfo {
// function to find all configurations. It also applies the user-defined constraints within.
std::vector<Configuration> SetConfigurations(const Device& device,
const std::vector<Parameter> parameters,
+ const std::vector<size_t>& local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
const Constraints& constraints,
const LocalMemSizeInfo& local_mem_size_info);
@@ -58,11 +61,16 @@ std::vector<Configuration> SetConfigurations(const Device& device,
// At the end of each chain (when all parameters are considered), the function stores the result
// into the configuration list.
void PopulateConfigurations(const std::vector<Parameter> &parameters,
+ const std::vector<size_t> local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
const size_t index, const Configuration &config,
std::vector<Configuration> &configuration,
const size_t local_mem_max,
const Constraints& constraints,
- const LocalMemSizeInfo& local_mem_size_info);
+ const LocalMemSizeInfo& local_mem_size_info,
+ const std::vector<size_t>& max_work_item_sizes,
+ const size_t max_work_group_size);
// Loops over all user-defined constraints to check whether or not the configuration is valid.
// Assumes initially all configurations are valid, then returns false if one of the constraints has
@@ -71,7 +79,12 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters,
bool ValidConfiguration(const Configuration &config,
const size_t local_mem_max,
const Constraints& constraints,
- const LocalMemSizeInfo& local_mem_size_info);
+ const LocalMemSizeInfo& local_mem_size_info,
+ const std::vector<size_t> local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
+ const std::vector<size_t>& max_work_item_sizes,
+ const size_t max_work_group_size);
// Processes multipliers and dividers to obtain the final thread configuration
std::vector<size_t> SetThreadConfiguration(const Configuration& config,
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 75e776e6..dd907ba4 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -33,9 +33,13 @@ void StartVariation(int argc, char *argv[]) {
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
+ printf("* (1/4) Tuning main GEMM kernel (GEMMK == 0) for fixed set of parameters\n\n");
StartVariation<1>(argc, argv);
+ printf("* (2/4) Tuning main GEMM kernel (GEMMK == 0) for random parameters out of larger set\n\n");
StartVariation<2>(argc, argv);
+ printf("* (3/4) Tuning secondary GEMM kernel (GEMMK == 1) for fixed set of parameters\n\n");
StartVariation<11>(argc, argv);
+ printf("* (4/4) Tuning secondary GEMM kernel (GEMMK == 1) for random parameters out of larger set\n\n");
StartVariation<12>(argc, argv);
return 0;
}
diff --git a/src/tuning/kernels/xgemm.hpp b/src/tuning/kernels/xgemm.hpp
index 9a538c1b..fa1bb6ec 100644
--- a/src/tuning/kernels/xgemm.hpp
+++ b/src/tuning/kernels/xgemm.hpp
@@ -50,6 +50,8 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments<T> &args) {
settings.sources +=
#include "../src/kernels/level3/xgemm_part1.opencl"
#include "../src/kernels/level3/xgemm_part2.opencl"
+ ;
+ settings.sources +=
#include "../src/kernels/level3/xgemm_part3.opencl"
#include "../src/kernels/level3/xgemm_part4.opencl"
;
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index 822f8851..d382fb18 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -172,7 +172,8 @@ void Tuner(int argc, char* argv[], const int V,
}
// Sets the tunable parameters and their possible values
- auto configurations = SetConfigurations(device, settings.parameters,
+ auto configurations = SetConfigurations(device, settings.parameters, settings.local_size,
+ settings.mul_local, settings.div_local,
SetConstraints(V), ComputeLocalMemSize(V));
printf("* Found %s%zu configuration(s)%s\n",
kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str());
diff --git a/src/tuning/tuning_api.cpp b/src/tuning/tuning_api.cpp
index 2eec2e2e..2cc9b786 100644
--- a/src/tuning/tuning_api.cpp
+++ b/src/tuning/tuning_api.cpp
@@ -264,7 +264,8 @@ StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
}
// Sets the tunable parameters and their possible values
- auto configurations = SetConfigurations(device, settings.parameters,
+ auto configurations = SetConfigurations(device, settings.parameters, settings.local_size,
+ settings.mul_local, settings.div_local,
SetConstraints(V), ComputeLocalMemSize(V));
// Select the search method (full search or a random fraction)
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index 835f54b4..00cb90cb 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -59,7 +59,8 @@ std::shared_ptr<Program> CompileFromSource(
}
// For Intel GPUs with subgroup support, use subgroup shuffling.
- if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) {
+ if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups) &&
+ (precision == Precision::kSingle || precision == Precision::kHalf)) {
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
header_string += "#define SUBGROUP_SHUFFLING_INTEL 1\n";
}
diff --git a/test/correctness/misc/preprocessor.cpp b/test/correctness/misc/preprocessor.cpp
index c5d115d3..4d6fa114 100644
--- a/test/correctness/misc/preprocessor.cpp
+++ b/test/correctness/misc/preprocessor.cpp
@@ -221,7 +221,7 @@ size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precisio
;
if (TestKernel(device, context, "TransposePadMatrix", transpose_pad_sources, precision)) { passed++; } else { errors++; }
- // GEMM (in-direct)
+ // GEMM (in-direct) GEMMK==0
const auto gemm_sources =
"#define KWI 2\n"
"#define MWG 16\n"
@@ -234,6 +234,18 @@ size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precisio
;
if (TestKernel(device, context, "Xgemm", gemm_sources, precision)) { passed++; } else { errors++; }
+ // GEMM (in-direct) GEMMK==1
+ const auto gemm_sources_gemmk1 =
+ "#define MWG 16\n"
+ "#define NWG 16\n"
+ "#define GEMMK 1\n"
+ #include "../src/kernels/level3/xgemm_part1.opencl"
+ #include "../src/kernels/level3/xgemm_part2.opencl"
+ #include "../src/kernels/level3/xgemm_part3.opencl"
+ #include "../src/kernels/level3/xgemm_part4.opencl"
+ ;
+ if (TestKernel(device, context, "Xgemm", gemm_sources_gemmk1, precision)) { passed++; } else { errors++; }
+
// GEMM (direct)
const auto gemm_direct_sources =
"#define KWID 2\n"
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index 3c92565e..d28aba40 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -239,7 +239,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
}
// Tests the error count (should be zero)
- TestErrorCount(errors, get_id1_(args)*get_id2_(args), args);
+ TestErrorCount(errors, get_id1_(args)*get_id2_(args) + kCanarySize, args);
}
TestEnd();
}
diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp
index ea35dbe2..4a93b29d 100644
--- a/test/routines/levelx/xomatcopy.hpp
+++ b/test/routines/levelx/xomatcopy.hpp
@@ -45,7 +45,9 @@ StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host)
const auto b_two = (b_rotated) ? id1 : id2;
const auto a_index = a_two * args.a_ld + a_one + args.a_offset;
const auto b_index = b_two * args.b_ld + b_one + args.b_offset;
- buffers_host.b_mat[b_index] = args.alpha * buffers_host.a_mat[a_index];
+ auto a_value = buffers_host.a_mat[a_index];
+ if (args.a_transpose == Transpose::kConjugate) { a_value = ComplexConjugate(a_value); }
+ buffers_host.b_mat[b_index] = args.alpha * a_value;
}
}
return StatusCode::kSuccess;
diff --git a/test/test_utilities.cpp b/test/test_utilities.cpp
index 59ec949d..c43200b9 100644
--- a/test/test_utilities.cpp
+++ b/test/test_utilities.cpp
@@ -31,6 +31,16 @@ template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value
// =================================================================================================
+// Performs a complex conjugate if complex
+template <typename T> T ComplexConjugate(const T value) { return value; }
+template half ComplexConjugate(const half);
+template float ComplexConjugate(const float);
+template double ComplexConjugate(const double);
+template <> float2 ComplexConjugate(const float2 value) { return float2{value.real(), -value.imag()}; }
+template <> double2 ComplexConjugate(const double2 value) { return double2{value.real(), -value.imag()}; }
+
+// =================================================================================================
+
template <typename T, typename U>
void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
Queue &queue, const std::vector<std::string> &names) {
diff --git a/test/test_utilities.hpp b/test/test_utilities.hpp
index 42660bdb..7bf5e65f 100644
--- a/test/test_utilities.hpp
+++ b/test/test_utilities.hpp
@@ -70,6 +70,10 @@ struct BuffersHost {
// =================================================================================================
+template <typename T> T ComplexConjugate(const T value);
+
+// =================================================================================================
+
// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
// data-types such as the Layout and Transpose data-types.
template <typename T>