summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2018-09-16 20:01:18 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2018-09-16 20:01:18 +0200
commit83ba3d4b7ba3a9cb5fbd2c1ad2bb14b2addd39fb (patch)
tree58900a63158d08e76342b46372fcc59015b4d3ca /src
parentb7d833901213d03fe5e7f10c15741f55c6c1eb54 (diff)
parentc163868e1822a97750b4380f0d9cdd38369f9f0b (diff)
Merge branch 'master' into convgemm_multi_kernel
Diffstat (limited to 'src')
-rw-r--r--src/clblast_netlib_c.cpp672
-rw-r--r--src/clpp11.hpp37
-rw-r--r--src/database/kernels/copy/copy_3232.hpp1
-rw-r--r--src/database/kernels/gemm_routine/gemm_routine_32.hpp3
-rw-r--r--src/database/kernels/gemm_routine/gemm_routine_3232.hpp3
-rw-r--r--src/database/kernels/invert/invert_32.hpp1
-rw-r--r--src/database/kernels/invert/invert_3232.hpp1
-rw-r--r--src/database/kernels/pad/pad_3232.hpp1
-rw-r--r--src/database/kernels/padtranspose/padtranspose_3232.hpp3
-rw-r--r--src/database/kernels/transpose/transpose_3232.hpp1
-rw-r--r--src/database/kernels/trsv_routine/trsv_routine_32.hpp1
-rw-r--r--src/database/kernels/trsv_routine/trsv_routine_3232.hpp1
-rw-r--r--src/database/kernels/xaxpy/xaxpy_3232.hpp1
-rw-r--r--src/database/kernels/xdot/xdot_32.hpp2
-rw-r--r--src/database/kernels/xdot/xdot_3232.hpp1
-rw-r--r--src/database/kernels/xgemm/xgemm_32.hpp2
-rw-r--r--src/database/kernels/xgemm/xgemm_3232.hpp3
-rw-r--r--src/database/kernels/xgemm_direct/xgemm_direct_32.hpp2
-rw-r--r--src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp3
-rw-r--r--src/database/kernels/xgemv/xgemv_3232.hpp1
-rw-r--r--src/database/kernels/xgemv_fast/xgemv_fast_32.hpp2
-rw-r--r--src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp1
-rw-r--r--src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp2
-rw-r--r--src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp1
-rw-r--r--src/database/kernels/xger/xger_3232.hpp1
-rw-r--r--src/kernel_preprocessor.cpp2
-rw-r--r--src/kernels/level3/xgemm_part1.opencl6
-rw-r--r--src/kernels/level3/xgemm_part2.opencl6
-rw-r--r--src/kernels/level3/xgemm_part3.opencl73
-rw-r--r--src/kernels/level3/xgemm_part4.opencl4
-rw-r--r--src/routines/common.hpp9
-rw-r--r--src/routines/level2/xtrsv.cpp13
-rw-r--r--src/routines/level2/xtrsv.hpp4
-rw-r--r--src/routines/level3/xgemm.cpp1
-rw-r--r--src/routines/level3/xherk.cpp1
-rw-r--r--src/routines/level3/xsyrk.cpp1
-rw-r--r--src/routines/level3/xtrsm.cpp2
-rw-r--r--src/routines/level3/xtrsm.hpp1
-rw-r--r--src/routines/levelx/xgemmbatched.cpp1
-rw-r--r--src/routines/levelx/xgemmstridedbatched.cpp1
-rw-r--r--src/tuning/configurations.cpp49
-rw-r--r--src/tuning/configurations.hpp17
-rw-r--r--src/tuning/kernels/xgemm.cpp4
-rw-r--r--src/tuning/kernels/xgemm.hpp2
-rw-r--r--src/tuning/tuning.cpp3
-rw-r--r--src/tuning/tuning_api.cpp3
-rw-r--r--src/utilities/compile.cpp3
47 files changed, 531 insertions, 422 deletions
diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp
index 9ab663be..dbc2ba57 100644
--- a/src/clblast_netlib_c.cpp
+++ b/src/clblast_netlib_c.cpp
@@ -23,6 +23,14 @@
using float2 = clblast::float2;
using double2 = clblast::double2;
+// Option to make OpenCL device and context static to avoid re-creation upon multiple calls to the
+// Netlib API. Disadvantage is that they are not cleaned-up until program termination.
+#ifdef NETLIB_PERSISTENT_OPENCL
+ #define OPTIONAL_STATIC static
+#else
+ #define OPTIONAL_STATIC
+#endif
+
// Helper function to get a default OpenCL platform and device
clblast::Device get_device() {
auto platform_id = clblast::ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0});
@@ -40,8 +48,8 @@ void cblas_srotg(float* sa,
float* sb,
float* sc,
float* ss) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto sa_size = 1;
const auto sb_size = 1;
@@ -73,8 +81,8 @@ void cblas_drotg(double* sa,
double* sb,
double* sc,
double* ss) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto sa_size = 1;
const auto sb_size = 1;
@@ -109,8 +117,8 @@ void cblas_srotmg(float* sd1,
float* sx1,
const float sy1,
float* sparam) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto sy1_size = 1;
const auto sd1_size = 1;
@@ -148,8 +156,8 @@ void cblas_drotmg(double* sd1,
double* sx1,
const double sy1,
double* sparam) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto sy1_size = 1;
const auto sd1_size = 1;
@@ -189,8 +197,8 @@ void cblas_srot(const int n,
float* y, const int y_inc,
const float cos,
const float sin) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -216,8 +224,8 @@ void cblas_drot(const int n,
double* y, const int y_inc,
const double cos,
const double sin) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -244,8 +252,8 @@ void cblas_srotm(const int n,
float* x, const int x_inc,
float* y, const int y_inc,
float* sparam) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -273,8 +281,8 @@ void cblas_drotm(const int n,
double* x, const int x_inc,
double* y, const int y_inc,
double* sparam) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -303,8 +311,8 @@ void cblas_drotm(const int n,
void cblas_sswap(const int n,
float* x, const int x_inc,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -326,8 +334,8 @@ void cblas_sswap(const int n,
void cblas_dswap(const int n,
double* x, const int x_inc,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -349,8 +357,8 @@ void cblas_dswap(const int n,
void cblas_cswap(const int n,
void* x, const int x_inc,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -372,8 +380,8 @@ void cblas_cswap(const int n,
void cblas_zswap(const int n,
void* x, const int x_inc,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -397,8 +405,8 @@ void cblas_zswap(const int n,
void cblas_sscal(const int n,
const float alpha,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -417,8 +425,8 @@ void cblas_sscal(const int n,
void cblas_dscal(const int n,
const double alpha,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -437,8 +445,8 @@ void cblas_dscal(const int n,
void cblas_cscal(const int n,
const void* alpha,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -457,8 +465,8 @@ void cblas_cscal(const int n,
void cblas_zscal(const int n,
const void* alpha,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -479,8 +487,8 @@ void cblas_zscal(const int n,
void cblas_scopy(const int n,
const float* x, const int x_inc,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -501,8 +509,8 @@ void cblas_scopy(const int n,
void cblas_dcopy(const int n,
const double* x, const int x_inc,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -523,8 +531,8 @@ void cblas_dcopy(const int n,
void cblas_ccopy(const int n,
const void* x, const int x_inc,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -545,8 +553,8 @@ void cblas_ccopy(const int n,
void cblas_zcopy(const int n,
const void* x, const int x_inc,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -570,8 +578,8 @@ void cblas_saxpy(const int n,
const float alpha,
const float* x, const int x_inc,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -595,8 +603,8 @@ void cblas_daxpy(const int n,
const double alpha,
const double* x, const int x_inc,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -620,8 +628,8 @@ void cblas_caxpy(const int n,
const void* alpha,
const void* x, const int x_inc,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -645,8 +653,8 @@ void cblas_zaxpy(const int n,
const void* alpha,
const void* x, const int x_inc,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -671,8 +679,8 @@ void cblas_zaxpy(const int n,
float cblas_sdot(const int n,
const float* x, const int x_inc,
const float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -698,8 +706,8 @@ float cblas_sdot(const int n,
double cblas_ddot(const int n,
const double* x, const int x_inc,
const double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -728,8 +736,8 @@ void cblas_cdotu_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -754,8 +762,8 @@ void cblas_zdotu_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -782,8 +790,8 @@ void cblas_cdotc_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -808,8 +816,8 @@ void cblas_zdotc_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto y_size = n * y_inc;
@@ -834,8 +842,8 @@ void cblas_zdotc_sub(const int n,
// NRM2
float cblas_snrm2(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto nrm2_size = 1;
@@ -856,8 +864,8 @@ float cblas_snrm2(const int n,
}
double cblas_dnrm2(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto nrm2_size = 1;
@@ -878,8 +886,8 @@ double cblas_dnrm2(const int n,
}
float cblas_scnrm2(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto nrm2_size = 1;
@@ -900,8 +908,8 @@ float cblas_scnrm2(const int n,
}
double cblas_dznrm2(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto nrm2_size = 1;
@@ -924,8 +932,8 @@ double cblas_dznrm2(const int n,
// ASUM
float cblas_sasum(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto asum_size = 1;
@@ -946,8 +954,8 @@ float cblas_sasum(const int n,
}
double cblas_dasum(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto asum_size = 1;
@@ -968,8 +976,8 @@ double cblas_dasum(const int n,
}
float cblas_scasum(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto asum_size = 1;
@@ -990,8 +998,8 @@ float cblas_scasum(const int n,
}
double cblas_dzasum(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto asum_size = 1;
@@ -1014,8 +1022,8 @@ double cblas_dzasum(const int n,
// SUM
float cblas_ssum(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto sum_size = 1;
@@ -1036,8 +1044,8 @@ float cblas_ssum(const int n,
}
double cblas_dsum(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto sum_size = 1;
@@ -1058,8 +1066,8 @@ double cblas_dsum(const int n,
}
float cblas_scsum(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto sum_size = 1;
@@ -1080,8 +1088,8 @@ float cblas_scsum(const int n,
}
double cblas_dzsum(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto sum_size = 1;
@@ -1104,8 +1112,8 @@ double cblas_dzsum(const int n,
// AMAX
int cblas_isamax(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1126,8 +1134,8 @@ int cblas_isamax(const int n,
}
int cblas_idamax(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1148,8 +1156,8 @@ int cblas_idamax(const int n,
}
int cblas_icamax(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1170,8 +1178,8 @@ int cblas_icamax(const int n,
}
int cblas_izamax(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1194,8 +1202,8 @@ int cblas_izamax(const int n,
// AMIN
int cblas_isamin(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1216,8 +1224,8 @@ int cblas_isamin(const int n,
}
int cblas_idamin(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1238,8 +1246,8 @@ int cblas_idamin(const int n,
}
int cblas_icamin(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1260,8 +1268,8 @@ int cblas_icamin(const int n,
}
int cblas_izamin(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1284,8 +1292,8 @@ int cblas_izamin(const int n,
// MAX
int cblas_ismax(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1306,8 +1314,8 @@ int cblas_ismax(const int n,
}
int cblas_idmax(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1328,8 +1336,8 @@ int cblas_idmax(const int n,
}
int cblas_icmax(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1350,8 +1358,8 @@ int cblas_icmax(const int n,
}
int cblas_izmax(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imax_size = 1;
@@ -1374,8 +1382,8 @@ int cblas_izmax(const int n,
// MIN
int cblas_ismin(const int n,
const float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1396,8 +1404,8 @@ int cblas_ismin(const int n,
}
int cblas_idmin(const int n,
const double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1418,8 +1426,8 @@ int cblas_idmin(const int n,
}
int cblas_icmin(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1440,8 +1448,8 @@ int cblas_icmin(const int n,
}
int cblas_izmin(const int n,
const void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto x_size = n * x_inc;
const auto imin_size = 1;
@@ -1473,8 +1481,8 @@ void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -1509,8 +1517,8 @@ void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -1545,8 +1553,8 @@ void cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -1581,8 +1589,8 @@ void cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -1619,8 +1627,8 @@ void cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -1655,8 +1663,8 @@ void cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -1691,8 +1699,8 @@ void cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -1727,8 +1735,8 @@ void cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -1765,8 +1773,8 @@ void cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -1801,8 +1809,8 @@ void cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -1839,8 +1847,8 @@ void cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -1875,8 +1883,8 @@ void cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -1913,8 +1921,8 @@ void cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -1949,8 +1957,8 @@ void cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -1987,8 +1995,8 @@ void cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -2023,8 +2031,8 @@ void cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -2061,8 +2069,8 @@ void cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -2097,8 +2105,8 @@ void cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -2135,8 +2143,8 @@ void cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -2171,8 +2179,8 @@ void cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -2206,8 +2214,8 @@ void cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const float* a, const int a_ld,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2233,8 +2241,8 @@ void cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const double* a, const int a_ld,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2260,8 +2268,8 @@ void cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2287,8 +2295,8 @@ void cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2316,8 +2324,8 @@ void cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const float* a, const int a_ld,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2343,8 +2351,8 @@ void cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const double* a, const int a_ld,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2370,8 +2378,8 @@ void cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2397,8 +2405,8 @@ void cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2426,8 +2434,8 @@ void cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const float* ap,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2453,8 +2461,8 @@ void cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const double* ap,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2480,8 +2488,8 @@ void cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* ap,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2507,8 +2515,8 @@ void cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* ap,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2536,8 +2544,8 @@ void cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const float* a, const int a_ld,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2563,8 +2571,8 @@ void cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const double* a, const int a_ld,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2590,8 +2598,8 @@ void cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2617,8 +2625,8 @@ void cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2646,8 +2654,8 @@ void cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const float* a, const int a_ld,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2673,8 +2681,8 @@ void cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const double* a, const int a_ld,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2700,8 +2708,8 @@ void cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2727,8 +2735,8 @@ void cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto a_size = n * a_ld;
const auto x_size = n * x_inc;
@@ -2756,8 +2764,8 @@ void cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const float* ap,
float* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2783,8 +2791,8 @@ void cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const double* ap,
double* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2810,8 +2818,8 @@ void cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* ap,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2837,8 +2845,8 @@ void cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const int n,
const void* ap,
void* x, const int x_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto ap_size = ((n*(n+1)) / 2);
const auto x_size = n * x_inc;
@@ -2868,8 +2876,8 @@ void cblas_sger(const CLBlastLayout layout,
const float* x, const int x_inc,
const float* y, const int y_inc,
float* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = m * x_inc;
@@ -2900,8 +2908,8 @@ void cblas_dger(const CLBlastLayout layout,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = m * x_inc;
@@ -2934,8 +2942,8 @@ void cblas_cgeru(const CLBlastLayout layout,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto x_size = m * x_inc;
@@ -2966,8 +2974,8 @@ void cblas_zgeru(const CLBlastLayout layout,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto x_size = m * x_inc;
@@ -3000,8 +3008,8 @@ void cblas_cgerc(const CLBlastLayout layout,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto x_size = m * x_inc;
@@ -3032,8 +3040,8 @@ void cblas_zgerc(const CLBlastLayout layout,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto x_size = m * x_inc;
@@ -3065,8 +3073,8 @@ void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float alpha,
const void* x, const int x_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3093,8 +3101,8 @@ void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double alpha,
const void* x, const int x_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3123,8 +3131,8 @@ void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float alpha,
const void* x, const int x_inc,
void* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3151,8 +3159,8 @@ void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double alpha,
const void* x, const int x_inc,
void* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3182,8 +3190,8 @@ void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -3215,8 +3223,8 @@ void cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -3250,8 +3258,8 @@ void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -3283,8 +3291,8 @@ void cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto x_size = n * x_inc;
@@ -3317,8 +3325,8 @@ void cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float alpha,
const float* x, const int x_inc,
float* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3345,8 +3353,8 @@ void cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double alpha,
const double* x, const int x_inc,
double* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3375,8 +3383,8 @@ void cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float alpha,
const float* x, const int x_inc,
float* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3403,8 +3411,8 @@ void cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double alpha,
const double* x, const int x_inc,
double* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3434,8 +3442,8 @@ void cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float* x, const int x_inc,
const float* y, const int y_inc,
float* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3467,8 +3475,8 @@ void cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* a, const int a_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3502,8 +3510,8 @@ void cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const float* x, const int x_inc,
const float* y, const int y_inc,
float* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3535,8 +3543,8 @@ void cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* ap) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto x_size = n * x_inc;
@@ -3575,8 +3583,8 @@ void cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const float* b, const int b_ld,
const float beta,
float* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -3612,8 +3620,8 @@ void cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const double* b, const int b_ld,
const double beta,
double* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -3649,8 +3657,8 @@ void cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -3686,8 +3694,8 @@ void cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -3725,8 +3733,8 @@ void cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const float* b, const int b_ld,
const float beta,
float* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -3762,8 +3770,8 @@ void cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const double* b, const int b_ld,
const double beta,
double* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -3799,8 +3807,8 @@ void cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -3836,8 +3844,8 @@ void cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -3875,8 +3883,8 @@ void cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -3912,8 +3920,8 @@ void cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -3950,8 +3958,8 @@ void cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const float* a, const int a_ld,
const float beta,
float* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -3982,8 +3990,8 @@ void cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const double* a, const int a_ld,
const double beta,
double* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4014,8 +4022,8 @@ void cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const void* a, const int a_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -4046,8 +4054,8 @@ void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const void* a, const int a_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -4080,8 +4088,8 @@ void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const void* a, const int a_ld,
const float beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4112,8 +4120,8 @@ void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con
const void* a, const int a_ld,
const double beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4147,8 +4155,8 @@ void cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co
const float* b, const int b_ld,
const float beta,
float* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4184,8 +4192,8 @@ void cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co
const double* b, const int b_ld,
const double beta,
double* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4221,8 +4229,8 @@ void cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -4258,8 +4266,8 @@ void cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -4297,8 +4305,8 @@ void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co
const void* b, const int b_ld,
const float beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = beta;
@@ -4334,8 +4342,8 @@ void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co
const void* b, const int b_ld,
const double beta,
void* c, const int c_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = beta;
@@ -4371,8 +4379,8 @@ void cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const float alpha,
const float* a, const int a_ld,
float* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4402,8 +4410,8 @@ void cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const double alpha,
const double* a, const int a_ld,
double* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4433,8 +4441,8 @@ void cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4464,8 +4472,8 @@ void cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4497,8 +4505,8 @@ void cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const float alpha,
const float* a, const int a_ld,
float* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4528,8 +4536,8 @@ void cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const double alpha,
const double* a, const int a_ld,
double* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4559,8 +4567,8 @@ void cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4590,8 +4598,8 @@ void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld;
@@ -4628,8 +4636,8 @@ void cblas_shad(const int n,
const float* y, const int y_inc,
const float beta,
float* z, const int z_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4661,8 +4669,8 @@ void cblas_dhad(const int n,
const double* y, const int y_inc,
const double beta,
double* z, const int z_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
@@ -4694,8 +4702,8 @@ void cblas_chad(const int n,
const void* y, const int y_inc,
const void* beta,
void* z, const int z_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
@@ -4727,8 +4735,8 @@ void cblas_zhad(const int n,
const void* y, const int y_inc,
const void* beta,
void* z, const int z_inc) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
@@ -4761,8 +4769,8 @@ void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp
const float alpha,
const float* a, const int a_ld,
float* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
@@ -4789,8 +4797,8 @@ void cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp
const double alpha,
const double* a, const int a_ld,
double* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = alpha;
const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
@@ -4817,8 +4825,8 @@ void cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
@@ -4845,8 +4853,8 @@ void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld;
@@ -4873,8 +4881,8 @@ void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp
void cblas_sim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const float* im,
float* col) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto im_size = height * width * channels;
const auto col_size = height * width * channels;
@@ -4895,8 +4903,8 @@ void cblas_sim2col(const int channels, const int height, const int width, const
void cblas_dim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const double* im,
double* col) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto im_size = height * width * channels;
const auto col_size = height * width * channels;
@@ -4917,8 +4925,8 @@ void cblas_dim2col(const int channels, const int height, const int width, const
void cblas_cim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const void* im,
void* col) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto im_size = height * width * channels;
const auto col_size = height * width * channels;
@@ -4939,8 +4947,8 @@ void cblas_cim2col(const int channels, const int height, const int width, const
void cblas_zim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const void* im,
void* col) {
- auto device = get_device();
- auto context = clblast::Context(device);
+ OPTIONAL_STATIC auto device = get_device();
+ OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto im_size = height * width * channels;
const auto col_size = height * width * channels;
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index 8ac0523f..94464990 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -447,8 +447,14 @@ class Program {
// Source-based constructor with memory management
explicit Program(const Context &context, const std::string &source) {
- const char *source_ptr = &source[0];
- const auto length = source.length();
+ #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+ const std::string source_null_kernel = source + "\n__kernel void null_kernel() {}\n";
+ const char *source_ptr = &source_null_kernel[0];
+ const auto length = source_null_kernel.length();
+ #else
+ const char *source_ptr = &source[0];
+ const auto length = source.length();
+ #endif
auto status = CL_SUCCESS;
program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
CLCudaAPIError::Check(status, "clCreateProgramWithSource");
@@ -723,9 +729,10 @@ class Buffer {
}
// Copies the contents of this buffer into another device buffer
- void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
+ void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination,
+ EventPointer event = nullptr) const {
CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0,
- nullptr, nullptr));
+ nullptr, event));
}
void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
CopyToAsync(queue, size, destination);
@@ -764,10 +771,21 @@ class Kernel {
kernel_(new cl_kernel, [](cl_kernel* k) {
if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
delete k;
- }) {
+ })
+ #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+ , null_kernel_(new cl_kernel, [](cl_kernel* k) {
+ if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
+ delete k;
+ })
+ #endif
+ {
auto status = CL_SUCCESS;
*kernel_ = clCreateKernel(program->operator()(), name.c_str(), &status);
CLCudaAPIError::Check(status, "clCreateKernel");
+ #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+ *null_kernel_ = clCreateKernel(program->operator()(), "null_kernel", &status);
+ CLCudaAPIError::Check(status, "clCreateKernel");
+ #endif
}
// Sets a kernel argument at the indicated position
@@ -831,12 +849,21 @@ class Kernel {
static_cast<cl_uint>(waitForEventsPlain.size()),
!waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr,
event));
+ #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+ const std::vector<size_t> nullRange = {1};
+ CheckError(clEnqueueNDRangeKernel(queue(), *null_kernel_, static_cast<cl_uint>(nullRange.size()),
+ nullptr, nullRange.data(), nullptr,
+ 0, nullptr, nullptr));
+ #endif
}
// Accessor to the private data-member
const cl_kernel& operator()() const { return *kernel_; }
private:
std::shared_ptr<cl_kernel> kernel_;
+ #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+ std::shared_ptr<cl_kernel> null_kernel_;
+ #endif
// Internal implementation for the recursive SetArguments function.
template <typename T>
diff --git a/src/database/kernels/copy/copy_3232.hpp b/src/database/kernels/copy/copy_3232.hpp
index 64d56a7b..83ba8106 100644
--- a/src/database/kernels/copy/copy_3232.hpp
+++ b/src/database/kernels/copy/copy_3232.hpp
@@ -88,6 +88,7 @@ const DatabaseEntry CopyComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 16, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 32, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/gemm_routine/gemm_routine_32.hpp b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
index b685d4bc..ba0cc5a3 100644
--- a/src/database/kernels/gemm_routine/gemm_routine_32.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
@@ -33,6 +33,7 @@ const DatabaseEntry GemmRoutineSingle = {
kDeviceTypeGPU, "Intel", {
{ "default", {
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
@@ -62,7 +63,7 @@ const DatabaseEntry GemmRoutineSingle = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 704, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
index c72db083..9977bb78 100644
--- a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
@@ -24,6 +24,7 @@ const DatabaseEntry GemmRoutineComplexSingle = {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "default", {
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
@@ -49,7 +50,7 @@ const DatabaseEntry GemmRoutineComplexSingle = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/invert/invert_32.hpp b/src/database/kernels/invert/invert_32.hpp
index b3f9143a..d550e3ba 100644
--- a/src/database/kernels/invert/invert_32.hpp
+++ b/src/database/kernels/invert/invert_32.hpp
@@ -24,6 +24,7 @@ const DatabaseEntry InvertSingle = {
kDeviceTypeGPU, "Intel", {
{ "default", {
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/invert/invert_3232.hpp b/src/database/kernels/invert/invert_3232.hpp
index 11ea895d..d1103909 100644
--- a/src/database/kernels/invert/invert_3232.hpp
+++ b/src/database/kernels/invert/invert_3232.hpp
@@ -23,6 +23,7 @@ const DatabaseEntry InvertComplexSingle = {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "default", {
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/pad/pad_3232.hpp b/src/database/kernels/pad/pad_3232.hpp
index 88ae08a3..08ed21a4 100644
--- a/src/database/kernels/pad/pad_3232.hpp
+++ b/src/database/kernels/pad/pad_3232.hpp
@@ -88,6 +88,7 @@ const DatabaseEntry PadComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 32, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 32, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/padtranspose/padtranspose_3232.hpp b/src/database/kernels/padtranspose/padtranspose_3232.hpp
index fb0ec5d0..32506c1e 100644
--- a/src/database/kernels/padtranspose/padtranspose_3232.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_3232.hpp
@@ -88,10 +88,11 @@ const DatabaseEntry PadtransposeComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/transpose/transpose_3232.hpp b/src/database/kernels/transpose/transpose_3232.hpp
index 45b2c3ff..b0de29f1 100644
--- a/src/database/kernels/transpose/transpose_3232.hpp
+++ b/src/database/kernels/transpose/transpose_3232.hpp
@@ -88,6 +88,7 @@ const DatabaseEntry TransposeComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/trsv_routine/trsv_routine_32.hpp b/src/database/kernels/trsv_routine/trsv_routine_32.hpp
index 2ee82b71..7fc71f5e 100644
--- a/src/database/kernels/trsv_routine/trsv_routine_32.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_32.hpp
@@ -24,6 +24,7 @@ const DatabaseEntry TrsvRoutineSingle = {
kDeviceTypeGPU, "Intel", {
{ "default", {
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/trsv_routine/trsv_routine_3232.hpp b/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
index 6f2f9306..221f12cf 100644
--- a/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
@@ -23,6 +23,7 @@ const DatabaseEntry TrsvRoutineComplexSingle = {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "default", {
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xaxpy/xaxpy_3232.hpp b/src/database/kernels/xaxpy/xaxpy_3232.hpp
index 4a29da4d..25cd3630 100644
--- a/src/database/kernels/xaxpy/xaxpy_3232.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_3232.hpp
@@ -88,6 +88,7 @@ const DatabaseEntry XaxpyComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 1, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xdot/xdot_32.hpp b/src/database/kernels/xdot/xdot_32.hpp
index a7e685bf..2b7bc724 100644
--- a/src/database/kernels/xdot/xdot_32.hpp
+++ b/src/database/kernels/xdot/xdot_32.hpp
@@ -84,7 +84,7 @@ const DatabaseEntry XdotSingle = {
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 512, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xdot/xdot_3232.hpp b/src/database/kernels/xdot/xdot_3232.hpp
index ad2cf414..e655e17e 100644
--- a/src/database/kernels/xdot/xdot_3232.hpp
+++ b/src/database/kernels/xdot/xdot_3232.hpp
@@ -82,6 +82,7 @@ const DatabaseEntry XdotComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemm/xgemm_32.hpp b/src/database/kernels/xgemm/xgemm_32.hpp
index 32358dbc..d75758ea 100644
--- a/src/database/kernels/xgemm/xgemm_32.hpp
+++ b/src/database/kernels/xgemm/xgemm_32.hpp
@@ -90,7 +90,7 @@ const DatabaseEntry XgemmSingle = {
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 1, 4, 1, 1, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 4 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 0, 1, 32, 2, 32, 8, 64, 16, 16, 128, 0, 0, 0, 1, 1, 2 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 0, 1, 16, 2, 16, 8, 32, 8, 16, 128, 1, 1, 1, 1, 2, 4 } },
- { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 16, 128, 1, 1, 0, 1, 1, 4 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 2, 1, 1, 4, 4, 32, 8, 8, 64, 0, 0, 0, 0, 2, 2 } },
{ Name{"Iris "}, Params{ 0, 1, 16, 8, 16, 8, 128, 32, 16, 64, 1, 1, 1, 1, 4, 1 } },
{ Name{"Iris Pro "}, Params{ 0, 1, 16, 2, 16, 8, 64, 32, 32, 128, 1, 1, 1, 0, 4, 4 } },
diff --git a/src/database/kernels/xgemm/xgemm_3232.hpp b/src/database/kernels/xgemm/xgemm_3232.hpp
index 22959347..9c0b70b1 100644
--- a/src/database/kernels/xgemm/xgemm_3232.hpp
+++ b/src/database/kernels/xgemm/xgemm_3232.hpp
@@ -88,10 +88,11 @@ const DatabaseEntry XgemmComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 0, 1, 16, 8, 8, 8, 32, 16, 16, 64, 1, 0, 0, 0, 4, 4 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 0, 1, 16, 2, 16, 8, 32, 8, 8, 32, 0, 0, 1, 0, 1, 1 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 0, 1, 32, 8, 16, 16, 64, 16, 16, 64, 1, 1, 1, 1, 2, 1 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 4, 1, 1, 32, 32, 128, 16, 16, 128, 0, 0, 0, 0, 4, 1 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 2, 1, 1, 4, 4, 16, 8, 8, 64, 0, 0, 0, 0, 2, 2 } },
{ Name{"Iris "}, Params{ 0, 1, 32, 8, 32, 16, 64, 8, 16, 64, 1, 0, 1, 0, 1, 1 } },
{ Name{"Iris Pro "}, Params{ 0, 1, 16, 2, 8, 8, 32, 32, 8, 32, 1, 1, 1, 1, 1, 1 } },
- { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 32, 32, 128, 1, 1, 1, 0, 2, 2 } },
+ { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 8, 32, 8, 8, 32, 0, 0, 1, 0, 1, 1 } },
} },
}
},
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
index f6ea9523..146018d5 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
@@ -69,7 +69,7 @@ const DatabaseEntry XgemmDirectSingle = {
{ "default", {
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } },
- { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 8, 16, 8, 16, 16, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 8, 16, 32, 16, 8, 1, 0, 1, 1, 64, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
index 8f24ee7d..6dd95b38 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
@@ -63,9 +63,10 @@ const DatabaseEntry XgemmDirectComplexSingle = {
kDeviceTypeGPU, "Intel", {
{ "default", {
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xgemv/xgemv_3232.hpp b/src/database/kernels/xgemv/xgemv_3232.hpp
index dc9a0a88..2b4328b3 100644
--- a/src/database/kernels/xgemv/xgemv_3232.hpp
+++ b/src/database/kernels/xgemv/xgemv_3232.hpp
@@ -86,6 +86,7 @@ const DatabaseEntry XgemvComplexSingle = {
{ Name{"Intel(R) HD Graphics 530 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
index 146bd466..351973ae 100644
--- a/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
@@ -92,7 +92,7 @@ const DatabaseEntry XgemvFastSingle = {
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
index 693fac4e..e9928b28 100644
--- a/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
@@ -86,6 +86,7 @@ const DatabaseEntry XgemvFastComplexSingle = {
{ Name{"Intel(R) HD Graphics 530 "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
index 42e7a36d..cf1b4e55 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
@@ -70,7 +70,7 @@ const DatabaseEntry XgemvFastRotSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 128, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
index 98d5cf6a..bf780835 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
@@ -68,6 +68,7 @@ const DatabaseEntry XgemvFastRotComplexSingle = {
{ "default", {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 4, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xger/xger_3232.hpp b/src/database/kernels/xger/xger_3232.hpp
index 123fc4fa..763f2ca6 100644
--- a/src/database/kernels/xger/xger_3232.hpp
+++ b/src/database/kernels/xger/xger_3232.hpp
@@ -87,6 +87,7 @@ const DatabaseEntry XgerComplexSingle = {
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/kernel_preprocessor.cpp b/src/kernel_preprocessor.cpp
index aa946bab..1c422d33 100644
--- a/src/kernel_preprocessor.cpp
+++ b/src/kernel_preprocessor.cpp
@@ -557,6 +557,8 @@ std::string PreprocessKernelSource(const std::string& kernel_source) {
lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers);
lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
+ lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
+ lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false);
lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, true);
// Gather the results
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 3cfc5dfb..80a60107 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -43,8 +43,6 @@
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
-// =================================================================================================
-
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef GEMMK
@@ -397,9 +395,7 @@ INLINE_FUNC realN LocalToPrivateB(LOCAL_PTR realN* blm, const int _ni, const int
}
#endif
-// =================================================================================================
-
-// End of the C++11 raw string literal
)"
+// End of the C++11 raw string literal
// =================================================================================================
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index 17c8955a..ee4d5da5 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -15,8 +15,6 @@
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
-// =================================================================================================
-
// The vectorised multiply-add function
INLINE_FUNC realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
#if USE_VECTOR_MAD == 1
@@ -171,9 +169,7 @@ INLINE_FUNC void StoreResults(__global realM* cgm, realM c_value, const int _mi,
cgm[index] = result;
}
-// =================================================================================================
-
-// End of the C++11 raw string literal
)"
+// End of the C++11 raw string literal
// =================================================================================================
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index 90de0b3b..77964a94 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -15,14 +15,12 @@
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
-// =================================================================================================
-
// A common interface for subgroup functions
#if USE_SUBGROUP_SHUFFLING == 1
INLINE_FUNC int clblast_get_sub_group_local_id() {
-
+
// Intel extension
#if SUBGROUP_SHUFFLING_INTEL == 1
return get_sub_group_local_id();
@@ -36,7 +34,7 @@ INLINE_FUNC int clblast_get_sub_group_local_id() {
}
INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
-
+
// Intel extension
#if SUBGROUP_SHUFFLING_INTEL == 1
return intel_sub_group_shuffle(reg, src);
@@ -238,48 +236,47 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
for (int _mi = 0; _mi < MWI/VWM; _mi += 1) {
#pragma unroll
for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
- const int index = _ni * (MWI/VWM) + _mi;
#if USE_SUBGROUP_SHUFFLING == 1
const realN aval = clblast_sub_group_shuffle(apm[_ki], _ni);
#else
const realN aval = apm[_ni * (KREG/VWN) + _ki];
#endif
#if VWN == 1
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval);
#elif VWN == 2
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y);
#elif VWN == 4
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.z);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.w);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.z);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.w);
#elif VWN == 8
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.s0);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.s1);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.s2);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.s3);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 4) * (MWI/VWM) + _mi], aval.s4);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 5) * (MWI/VWM) + _mi], aval.s5);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 6) * (MWI/VWM) + _mi], aval.s6);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 7) * (MWI/VWM) + _mi], aval.s7);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.s0);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.s1);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.s2);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.s3);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 4) * (MWI/VWM) + _mi], aval.s4);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 5) * (MWI/VWM) + _mi], aval.s5);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 6) * (MWI/VWM) + _mi], aval.s6);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 7) * (MWI/VWM) + _mi], aval.s7);
#elif VWN == 16
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0 ) * (MWI/VWM) + _mi], aval.s0);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1 ) * (MWI/VWM) + _mi], aval.s1);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 2 ) * (MWI/VWM) + _mi], aval.s2);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 3 ) * (MWI/VWM) + _mi], aval.s3);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 4 ) * (MWI/VWM) + _mi], aval.s4);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 5 ) * (MWI/VWM) + _mi], aval.s5);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 6 ) * (MWI/VWM) + _mi], aval.s6);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 7 ) * (MWI/VWM) + _mi], aval.s7);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 8 ) * (MWI/VWM) + _mi], aval.s8);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 9 ) * (MWI/VWM) + _mi], aval.s9);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 10) * (MWI/VWM) + _mi], aval.sA);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 11) * (MWI/VWM) + _mi], aval.sB);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 12) * (MWI/VWM) + _mi], aval.sC);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 13) * (MWI/VWM) + _mi], aval.sD);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 14) * (MWI/VWM) + _mi], aval.sE);
- cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 15) * (MWI/VWM) + _mi], aval.sF);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0 ) * (MWI/VWM) + _mi], aval.s0);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1 ) * (MWI/VWM) + _mi], aval.s1);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2 ) * (MWI/VWM) + _mi], aval.s2);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3 ) * (MWI/VWM) + _mi], aval.s3);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 4 ) * (MWI/VWM) + _mi], aval.s4);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 5 ) * (MWI/VWM) + _mi], aval.s5);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 6 ) * (MWI/VWM) + _mi], aval.s6);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 7 ) * (MWI/VWM) + _mi], aval.s7);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 8 ) * (MWI/VWM) + _mi], aval.s8);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 9 ) * (MWI/VWM) + _mi], aval.s9);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 10) * (MWI/VWM) + _mi], aval.sA);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 11) * (MWI/VWM) + _mi], aval.sB);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 12) * (MWI/VWM) + _mi], aval.sC);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 13) * (MWI/VWM) + _mi], aval.sD);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 14) * (MWI/VWM) + _mi], aval.sE);
+ cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 15) * (MWI/VWM) + _mi], aval.sF);
#endif
}
}
@@ -311,9 +308,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
}
}
-// =================================================================================================
-
-// End of the C++11 raw string literal
)"
+// End of the C++11 raw string literal
// =================================================================================================
diff --git a/src/kernels/level3/xgemm_part4.opencl b/src/kernels/level3/xgemm_part4.opencl
index e581cd84..b1f1ade6 100644
--- a/src/kernels/level3/xgemm_part4.opencl
+++ b/src/kernels/level3/xgemm_part4.opencl
@@ -15,7 +15,6 @@
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
-// =================================================================================================
// The upper-triangular and lower-triangular kernels are only used in special cases
#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
@@ -132,9 +131,8 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
}
#endif
-// =================================================================================================
-// End of the C++11 raw string literal
)"
+// End of the C++11 raw string literal
// =================================================================================================
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index c30a2e0e..c6db0152 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -76,6 +76,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
// Determines the right kernel
auto kernel_name = std::string{};
+ auto pad_kernel = false;
if (do_transpose) {
if (use_fast_kernel &&
IsMultiple(src_ld, db["TRA_WPT"]) &&
@@ -85,7 +86,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
else {
use_fast_kernel = false;
- kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
+ pad_kernel = (do_pad || do_conjugate);
+ kernel_name = (pad_kernel) ? "TransposePadMatrix" : "TransposeMatrix";
}
}
else {
@@ -97,7 +99,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
else {
use_fast_kernel = false;
- kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
+ pad_kernel = do_pad;
+ kernel_name = (pad_kernel) ? "CopyPadMatrix" : "CopyMatrix";
}
}
@@ -123,7 +126,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
kernel.SetArgument(10, GetRealArg(alpha));
- if (do_pad) {
+ if (pad_kernel) {
kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
index 76401753..2a5a5664 100644
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@@ -33,7 +33,8 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ EventPointer event) {
if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); };
@@ -69,9 +70,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
// Launches the kernel
const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]};
const auto global = std::vector<size_t>{Ceil(n, db_["TRSV_BLOCK_SIZE"])};
- auto event = Event();
- RunKernel(kernel, queue_, device_, global, local, event.pointer());
- event.WaitForCompletion();
+ RunKernel(kernel, queue_, device_, global, local, event);
}
// =================================================================================================
@@ -146,14 +145,16 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
}
// Runs the triangular substitution for the block size
+ auto sub_event = Event();
Substitution(layout, triangle, a_transpose, diagonal, block_size,
a_buffer, a_offset + col + col*a_ld, a_ld,
b_buffer, b_offset + col*b_inc, b_inc,
- x_buffer, x_offset + col*x_inc, x_inc);
+ x_buffer, x_offset + col*x_inc, x_inc, sub_event.pointer());
+ sub_event.WaitForCompletion();
}
// Retrieves the results
- x_buffer.CopyTo(queue_, x_size, b_buffer);
+ x_buffer.CopyToAsync(queue_, x_size, b_buffer, event_);
}
// =================================================================================================
diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp
index 67e626a1..8a900a35 100644
--- a/src/routines/level2/xtrsv.hpp
+++ b/src/routines/level2/xtrsv.hpp
@@ -32,6 +32,7 @@ class Xtrsv: public Xgemv<T> {
using Xgemv<T>::device_;
using Xgemv<T>::db_;
using Xgemv<T>::program_;
+ using Xgemv<T>::event_;
using Xgemv<T>::DoGemv;
// Constructor
@@ -50,7 +51,8 @@ class Xtrsv: public Xgemv<T> {
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
- const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc);
+ const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc,
+ EventPointer event);
};
// =================================================================================================
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index fd5a20db..cb24460a 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -40,6 +40,7 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ , // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part3.opencl"
#include "../../kernels/level3/xgemm_part4.opencl"
}) {
diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp
index 6912d3a9..2e6f30ec 100644
--- a/src/routines/level3/xherk.cpp
+++ b/src/routines/level3/xherk.cpp
@@ -32,6 +32,7 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ , // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part3.opencl"
#include "../../kernels/level3/xgemm_part4.opencl"
}) {
diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp
index 6bb2a24f..5ffdc028 100644
--- a/src/routines/level3/xsyrk.cpp
+++ b/src/routines/level3/xsyrk.cpp
@@ -32,6 +32,7 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ , // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part3.opencl"
#include "../../kernels/level3/xgemm_part4.opencl"
}) {
diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp
index 905660ff..fe5d1e14 100644
--- a/src/routines/level3/xtrsm.cpp
+++ b/src/routines/level3/xtrsm.cpp
@@ -246,7 +246,7 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
}
// Retrieves the results
- x_buffer.CopyTo(queue_, b_size, b_buffer);
+ x_buffer.CopyToAsync(queue_, b_size, b_buffer, event_);
}
// =================================================================================================
diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp
index 5b42398e..871d7253 100644
--- a/src/routines/level3/xtrsm.hpp
+++ b/src/routines/level3/xtrsm.hpp
@@ -31,6 +31,7 @@ class Xtrsm: public Xgemm<T> {
using Xgemm<T>::device_;
using Xgemm<T>::db_;
using Xgemm<T>::program_;
+ using Xgemm<T>::event_;
using Xgemm<T>::DoGemm;
// Constructor
diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp
index 2bbc5007..b12b8734 100644
--- a/src/routines/levelx/xgemmbatched.cpp
+++ b/src/routines/levelx/xgemmbatched.cpp
@@ -38,6 +38,7 @@ XgemmBatched<T>::XgemmBatched(Queue &queue, EventPointer event, const std::strin
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ , // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part3.opencl"
#include "../../kernels/level3/xgemm_part4.opencl"
, // separated in multiple parts to prevent C1091 in MSVC 2013
diff --git a/src/routines/levelx/xgemmstridedbatched.cpp b/src/routines/levelx/xgemmstridedbatched.cpp
index 30c161cc..d9e3ebba 100644
--- a/src/routines/levelx/xgemmstridedbatched.cpp
+++ b/src/routines/levelx/xgemmstridedbatched.cpp
@@ -37,6 +37,7 @@ XgemmStridedBatched<T>::XgemmStridedBatched(Queue &queue, EventPointer event, co
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ , // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part3.opencl"
#include "../../kernels/level3/xgemm_part4.opencl"
, // separated in multiple parts to prevent C1091 in MSVC 2013
diff --git a/src/tuning/configurations.cpp b/src/tuning/configurations.cpp
index 1fe232cf..82d7e3b4 100644
--- a/src/tuning/configurations.cpp
+++ b/src/tuning/configurations.cpp
@@ -23,28 +23,42 @@ namespace clblast {
// Finds all configurations. It also applies the user-defined constraints within.
std::vector<Configuration> SetConfigurations(const Device& device,
const std::vector<Parameter> parameters,
+ const std::vector<size_t>& local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
const Constraints& constraints,
const LocalMemSizeInfo& local_mem_size_info) {
const auto local_mem_max = device.LocalMemSize();
+ const auto max_work_item_sizes = device.MaxWorkItemSizes();
+ const auto max_work_group_size = device.MaxWorkGroupSize();
auto config = Configuration();
auto configurations = std::vector<Configuration>();
- PopulateConfigurations(parameters, 0, config, configurations,
- local_mem_max, constraints, local_mem_size_info);
+ PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config,
+ 0, config, configurations,
+ local_mem_max, constraints, local_mem_size_info,
+ max_work_item_sizes, max_work_group_size);
return configurations;
}
// Iterates recursively over all permutations of the user-defined parameters
void PopulateConfigurations(const std::vector<Parameter> &parameters,
+ const std::vector<size_t> local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
const size_t index, const Configuration &config,
std::vector<Configuration> &configuration,
const size_t local_mem_max,
const Constraints& constraints,
- const LocalMemSizeInfo& local_mem_size_info) {
+ const LocalMemSizeInfo& local_mem_size_info,
+ const std::vector<size_t>& max_work_item_sizes,
+ const size_t max_work_group_size) {
// End of the chain: all parameters are considered, store the resulting configuration if it is a
// valid one according to the constraints
if (index == parameters.size()) {
- if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info)) {
+ if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info,
+ local_size_base, mul_local_config, div_local_config,
+ max_work_item_sizes, max_work_group_size)) {
configuration.push_back(config);
}
return;
@@ -55,8 +69,10 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters,
for (auto &value: parameter.second) {
auto config_copy = config;
config_copy[parameter.first] = value;
- PopulateConfigurations(parameters, index+1, config_copy, configuration,
- local_mem_max, constraints, local_mem_size_info);
+ PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config,
+ index+1, config_copy, configuration,
+ local_mem_max, constraints, local_mem_size_info,
+ max_work_item_sizes, max_work_group_size);
}
}
@@ -64,7 +80,12 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters,
bool ValidConfiguration(const Configuration &config,
const size_t local_mem_max,
const Constraints& constraints,
- const LocalMemSizeInfo& local_mem_size_info) {
+ const LocalMemSizeInfo& local_mem_size_info,
+ const std::vector<size_t> local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
+ const std::vector<size_t>& max_work_item_sizes,
+ const size_t max_work_group_size) {
// Iterates over all constraints
for (auto &constraint: constraints) {
@@ -92,6 +113,20 @@ bool ValidConfiguration(const Configuration &config,
return false;
}
+ // Checks the local thread size (both per dimension and in total)
+ const auto local = SetThreadConfiguration(config, local_size_base,
+ mul_local_config, div_local_config);
+ for (auto i=size_t{0}; i<local.size(); ++i) {
+ if (local[i] > max_work_item_sizes[i]) {
+ return false;
+ }
+ }
+ auto local_size = size_t{1};
+ for (auto &item: local) { local_size *= item; }
+ if (local_size > max_work_group_size) {
+ return false;
+ }
+
// Everything was OK: this configuration is valid
return true;
}
diff --git a/src/tuning/configurations.hpp b/src/tuning/configurations.hpp
index faa5498f..4b9ba93f 100644
--- a/src/tuning/configurations.hpp
+++ b/src/tuning/configurations.hpp
@@ -50,6 +50,9 @@ struct LocalMemSizeInfo {
// function to find all configurations. It also applies the user-defined constraints within.
std::vector<Configuration> SetConfigurations(const Device& device,
const std::vector<Parameter> parameters,
+ const std::vector<size_t>& local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
const Constraints& constraints,
const LocalMemSizeInfo& local_mem_size_info);
@@ -58,11 +61,16 @@ std::vector<Configuration> SetConfigurations(const Device& device,
// At the end of each chain (when all parameters are considered), the function stores the result
// into the configuration list.
void PopulateConfigurations(const std::vector<Parameter> &parameters,
+ const std::vector<size_t> local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
const size_t index, const Configuration &config,
std::vector<Configuration> &configuration,
const size_t local_mem_max,
const Constraints& constraints,
- const LocalMemSizeInfo& local_mem_size_info);
+ const LocalMemSizeInfo& local_mem_size_info,
+ const std::vector<size_t>& max_work_item_sizes,
+ const size_t max_work_group_size);
// Loops over all user-defined constraints to check whether or not the configuration is valid.
// Assumes initially all configurations are valid, then returns false if one of the constraints has
@@ -71,7 +79,12 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters,
bool ValidConfiguration(const Configuration &config,
const size_t local_mem_max,
const Constraints& constraints,
- const LocalMemSizeInfo& local_mem_size_info);
+ const LocalMemSizeInfo& local_mem_size_info,
+ const std::vector<size_t> local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
+ const std::vector<size_t>& max_work_item_sizes,
+ const size_t max_work_group_size);
// Processes multipliers and dividers to obtain the final thread configuration
std::vector<size_t> SetThreadConfiguration(const Configuration& config,
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 75e776e6..dd907ba4 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -33,9 +33,13 @@ void StartVariation(int argc, char *argv[]) {
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
+ printf("* (1/4) Tuning main GEMM kernel (GEMMK == 0) for fixed set of parameters\n\n");
StartVariation<1>(argc, argv);
+ printf("* (2/4) Tuning main GEMM kernel (GEMMK == 0) for random parameters out of larger set\n\n");
StartVariation<2>(argc, argv);
+ printf("* (3/4) Tuning secondary GEMM kernel (GEMMK == 1) for fixed set of parameters\n\n");
StartVariation<11>(argc, argv);
+ printf("* (4/4) Tuning secondary GEMM kernel (GEMMK == 1) for random parameters out of larger set\n\n");
StartVariation<12>(argc, argv);
return 0;
}
diff --git a/src/tuning/kernels/xgemm.hpp b/src/tuning/kernels/xgemm.hpp
index 9a538c1b..fa1bb6ec 100644
--- a/src/tuning/kernels/xgemm.hpp
+++ b/src/tuning/kernels/xgemm.hpp
@@ -50,6 +50,8 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments<T> &args) {
settings.sources +=
#include "../src/kernels/level3/xgemm_part1.opencl"
#include "../src/kernels/level3/xgemm_part2.opencl"
+ ;
+ settings.sources +=
#include "../src/kernels/level3/xgemm_part3.opencl"
#include "../src/kernels/level3/xgemm_part4.opencl"
;
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index 822f8851..d382fb18 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -172,7 +172,8 @@ void Tuner(int argc, char* argv[], const int V,
}
// Sets the tunable parameters and their possible values
- auto configurations = SetConfigurations(device, settings.parameters,
+ auto configurations = SetConfigurations(device, settings.parameters, settings.local_size,
+ settings.mul_local, settings.div_local,
SetConstraints(V), ComputeLocalMemSize(V));
printf("* Found %s%zu configuration(s)%s\n",
kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str());
diff --git a/src/tuning/tuning_api.cpp b/src/tuning/tuning_api.cpp
index 2eec2e2e..2cc9b786 100644
--- a/src/tuning/tuning_api.cpp
+++ b/src/tuning/tuning_api.cpp
@@ -264,7 +264,8 @@ StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
}
// Sets the tunable parameters and their possible values
- auto configurations = SetConfigurations(device, settings.parameters,
+ auto configurations = SetConfigurations(device, settings.parameters, settings.local_size,
+ settings.mul_local, settings.div_local,
SetConstraints(V), ComputeLocalMemSize(V));
// Select the search method (full search or a random fraction)
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index 835f54b4..00cb90cb 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -59,7 +59,8 @@ std::shared_ptr<Program> CompileFromSource(
}
// For Intel GPUs with subgroup support, use subgroup shuffling.
- if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) {
+ if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups) &&
+ (precision == Precision::kSingle || precision == Precision::kHalf)) {
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
header_string += "#define SUBGROUP_SHUFFLING_INTEL 1\n";
}