diff options
Diffstat (limited to 'src')
47 files changed, 531 insertions, 422 deletions
diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp index 9ab663be..dbc2ba57 100644 --- a/src/clblast_netlib_c.cpp +++ b/src/clblast_netlib_c.cpp @@ -23,6 +23,14 @@ using float2 = clblast::float2; using double2 = clblast::double2; +// Option to make OpenCL device and context static to avoid re-creation upon multiple calls to the +// Netlib API. Disadvantage is that they are not cleaned-up until program termination. +#ifdef NETLIB_PERSISTENT_OPENCL + #define OPTIONAL_STATIC static +#else + #define OPTIONAL_STATIC +#endif + // Helper function to get a default OpenCL platform and device clblast::Device get_device() { auto platform_id = clblast::ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}); @@ -40,8 +48,8 @@ void cblas_srotg(float* sa, float* sb, float* sc, float* ss) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto sa_size = 1; const auto sb_size = 1; @@ -73,8 +81,8 @@ void cblas_drotg(double* sa, double* sb, double* sc, double* ss) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto sa_size = 1; const auto sb_size = 1; @@ -109,8 +117,8 @@ void cblas_srotmg(float* sd1, float* sx1, const float sy1, float* sparam) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto sy1_size = 1; const auto sd1_size = 1; @@ -148,8 +156,8 @@ void cblas_drotmg(double* sd1, double* sx1, const double sy1, double* sparam) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto sy1_size = 1; const auto sd1_size = 1; @@ -189,8 +197,8 @@ void cblas_srot(const int n, float* y, const int y_inc, const float cos, const float sin) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -216,8 +224,8 @@ void cblas_drot(const int n, double* y, const int y_inc, const double cos, const double sin) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -244,8 +252,8 @@ void cblas_srotm(const int n, float* x, const int x_inc, float* y, const int y_inc, float* sparam) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -273,8 +281,8 @@ void cblas_drotm(const int n, double* x, const int x_inc, double* y, const int y_inc, double* sparam) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -303,8 +311,8 @@ void cblas_drotm(const int n, void cblas_sswap(const int n, float* x, const int x_inc, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -326,8 +334,8 @@ void cblas_sswap(const int n, void cblas_dswap(const int n, double* x, const int x_inc, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -349,8 +357,8 @@ void cblas_dswap(const int n, void cblas_cswap(const int n, void* x, const int x_inc, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -372,8 +380,8 @@ void cblas_cswap(const int n, void cblas_zswap(const int n, void* x, const int x_inc, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -397,8 +405,8 @@ void cblas_zswap(const int n, void cblas_sscal(const int n, const float alpha, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -417,8 +425,8 @@ void cblas_sscal(const int n, void cblas_dscal(const int n, const double alpha, double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -437,8 +445,8 @@ void cblas_dscal(const int n, void cblas_cscal(const int n, const void* alpha, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto x_size = n * x_inc; @@ -457,8 +465,8 @@ void cblas_cscal(const int n, void cblas_zscal(const int n, const void* alpha, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto x_size = n * x_inc; @@ -479,8 +487,8 @@ void cblas_zscal(const int n, void cblas_scopy(const int n, const float* x, const int x_inc, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -501,8 +509,8 @@ void cblas_scopy(const int n, void cblas_dcopy(const int n, const double* x, const int x_inc, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -523,8 +531,8 @@ void cblas_dcopy(const int n, void cblas_ccopy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -545,8 +553,8 @@ void cblas_ccopy(const int n, void cblas_zcopy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -570,8 +578,8 @@ void cblas_saxpy(const int n, const float alpha, const float* x, const int x_inc, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -595,8 +603,8 @@ void cblas_daxpy(const int n, const double alpha, const double* x, const int x_inc, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -620,8 +628,8 @@ void cblas_caxpy(const int n, const void* alpha, const void* x, const int x_inc, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto x_size = n * x_inc; @@ -645,8 +653,8 @@ void cblas_zaxpy(const int n, const void* alpha, const void* x, const int x_inc, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto x_size = n * x_inc; @@ -671,8 +679,8 @@ void cblas_zaxpy(const int n, float cblas_sdot(const int n, const float* x, const int x_inc, const float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -698,8 +706,8 @@ float cblas_sdot(const int n, double cblas_ddot(const int n, const double* x, const int x_inc, const double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -728,8 +736,8 @@ void cblas_cdotu_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -754,8 +762,8 @@ void cblas_zdotu_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -782,8 +790,8 @@ void cblas_cdotc_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -808,8 +816,8 @@ void cblas_zdotc_sub(const int n, const void* x, const int x_inc, const void* y, const int y_inc, void* dot) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto y_size = n * y_inc; @@ -834,8 +842,8 @@ void cblas_zdotc_sub(const int n, // NRM2 float cblas_snrm2(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto nrm2_size = 1; @@ -856,8 +864,8 @@ float cblas_snrm2(const int n, } double cblas_dnrm2(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto nrm2_size = 1; @@ -878,8 +886,8 @@ double cblas_dnrm2(const int n, } float cblas_scnrm2(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto nrm2_size = 1; @@ -900,8 +908,8 @@ float cblas_scnrm2(const int n, } double cblas_dznrm2(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto nrm2_size = 1; @@ -924,8 +932,8 @@ double cblas_dznrm2(const int n, // ASUM float cblas_sasum(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto asum_size = 1; @@ -946,8 +954,8 @@ float cblas_sasum(const int n, } double cblas_dasum(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto asum_size = 1; @@ -968,8 +976,8 @@ double cblas_dasum(const int n, } float cblas_scasum(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto asum_size = 1; @@ -990,8 +998,8 @@ float cblas_scasum(const int n, } double cblas_dzasum(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto asum_size = 1; @@ -1014,8 +1022,8 @@ double cblas_dzasum(const int n, // SUM float cblas_ssum(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto sum_size = 1; @@ -1036,8 +1044,8 @@ float cblas_ssum(const int n, } double cblas_dsum(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto sum_size = 1; @@ -1058,8 +1066,8 @@ double cblas_dsum(const int n, } float cblas_scsum(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto sum_size = 1; @@ -1080,8 +1088,8 @@ float cblas_scsum(const int n, } double cblas_dzsum(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto sum_size = 1; @@ -1104,8 +1112,8 @@ double cblas_dzsum(const int n, // AMAX int cblas_isamax(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1126,8 +1134,8 @@ int cblas_isamax(const int n, } int cblas_idamax(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1148,8 +1156,8 @@ int cblas_idamax(const int n, } int cblas_icamax(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1170,8 +1178,8 @@ int cblas_icamax(const int n, } int cblas_izamax(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1194,8 +1202,8 @@ int cblas_izamax(const int n, // AMIN int cblas_isamin(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1216,8 +1224,8 @@ int cblas_isamin(const int n, } int cblas_idamin(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1238,8 +1246,8 @@ int cblas_idamin(const int n, } int cblas_icamin(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1260,8 +1268,8 @@ int cblas_icamin(const int n, } int cblas_izamin(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1284,8 +1292,8 @@ int cblas_izamin(const int n, // MAX int cblas_ismax(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1306,8 +1314,8 @@ int cblas_ismax(const int n, } int cblas_idmax(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1328,8 +1336,8 @@ int cblas_idmax(const int n, } int cblas_icmax(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1350,8 +1358,8 @@ int cblas_icmax(const int n, } int cblas_izmax(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imax_size = 1; @@ -1374,8 +1382,8 @@ int cblas_izmax(const int n, // MIN int cblas_ismin(const int n, const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1396,8 +1404,8 @@ int cblas_ismin(const int n, } int cblas_idmin(const int n, const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1418,8 +1426,8 @@ int cblas_idmin(const int n, } int cblas_icmin(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1440,8 +1448,8 @@ int cblas_icmin(const int n, } int cblas_izmin(const int n, const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n * x_inc; const auto imin_size = 1; @@ -1473,8 +1481,8 @@ void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const float* x, const int x_inc, const float beta, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -1509,8 +1517,8 @@ void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const double* x, const int x_inc, const double beta, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -1545,8 +1553,8 @@ void cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; @@ -1581,8 +1589,8 @@ void cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; @@ -1619,8 +1627,8 @@ void cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const float* x, const int x_inc, const float beta, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -1655,8 +1663,8 @@ void cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const double* x, const int x_inc, const double beta, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -1691,8 +1699,8 @@ void cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; @@ -1727,8 +1735,8 @@ void cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; @@ -1765,8 +1773,8 @@ void cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; @@ -1801,8 +1809,8 @@ void cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; @@ -1839,8 +1847,8 @@ void cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; @@ -1875,8 +1883,8 @@ void cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; @@ -1913,8 +1921,8 @@ void cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; @@ -1949,8 +1957,8 @@ void cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* beta, void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; @@ -1987,8 +1995,8 @@ void cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, const float beta, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -2023,8 +2031,8 @@ void cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, const double beta, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -2061,8 +2069,8 @@ void cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, const float beta, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -2097,8 +2105,8 @@ void cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, const double beta, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -2135,8 +2143,8 @@ void cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, const float beta, float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -2171,8 +2179,8 @@ void cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, const double beta, double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -2206,8 +2214,8 @@ void cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const float* a, const int a_ld, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2233,8 +2241,8 @@ void cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const double* a, const int a_ld, double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2260,8 +2268,8 @@ void cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2287,8 +2295,8 @@ void cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2316,8 +2324,8 @@ void cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2343,8 +2351,8 @@ void cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2370,8 +2378,8 @@ void cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2397,8 +2405,8 @@ void cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2426,8 +2434,8 @@ void cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const float* ap, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2453,8 +2461,8 @@ void cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const double* ap, double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2480,8 +2488,8 @@ void cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* ap, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2507,8 +2515,8 @@ void cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* ap, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2536,8 +2544,8 @@ void cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const float* a, const int a_ld, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2563,8 +2571,8 @@ void cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const double* a, const int a_ld, double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2590,8 +2598,8 @@ void cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2617,8 +2625,8 @@ void cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2646,8 +2654,8 @@ void cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2673,8 +2681,8 @@ void cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2700,8 +2708,8 @@ void cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2727,8 +2735,8 @@ void cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; @@ -2756,8 +2764,8 @@ void cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const float* ap, float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2783,8 +2791,8 @@ void cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const double* ap, double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2810,8 +2818,8 @@ void cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* ap, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2837,8 +2845,8 @@ void cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const int n, const void* ap, void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; @@ -2868,8 +2876,8 @@ void cblas_sger(const CLBlastLayout layout, const float* x, const int x_inc, const float* y, const int y_inc, float* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = m * x_inc; @@ -2900,8 +2908,8 @@ void cblas_dger(const CLBlastLayout layout, const double* x, const int x_inc, const double* y, const int y_inc, double* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = m * x_inc; @@ -2934,8 +2942,8 @@ void cblas_cgeru(const CLBlastLayout layout, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto x_size = m * x_inc; @@ -2966,8 +2974,8 @@ void cblas_zgeru(const CLBlastLayout layout, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto x_size = m * x_inc; @@ -3000,8 +3008,8 @@ void cblas_cgerc(const CLBlastLayout layout, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto x_size = m * x_inc; @@ -3032,8 +3040,8 @@ void cblas_zgerc(const CLBlastLayout layout, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto x_size = m * x_inc; @@ -3065,8 +3073,8 @@ void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const float alpha, const void* x, const int x_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3093,8 +3101,8 @@ void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const double alpha, const void* x, const int x_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3123,8 +3131,8 @@ void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const float alpha, const void* x, const int x_inc, void* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3151,8 +3159,8 @@ void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const double alpha, const void* x, const int x_inc, void* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3182,8 +3190,8 @@ void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto x_size = n * x_inc; @@ -3215,8 +3223,8 @@ void cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto x_size = n * x_inc; @@ -3250,8 +3258,8 @@ void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* y, const int y_inc, void* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto x_size = n * x_inc; @@ -3283,8 +3291,8 @@ void cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, const void* y, const int y_inc, void* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto x_size = n * x_inc; @@ -3317,8 +3325,8 @@ void cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const float alpha, const float* x, const int x_inc, float* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3345,8 +3353,8 @@ void cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const double alpha, const double* x, const int x_inc, double* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3375,8 +3383,8 @@ void cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const float alpha, const float* x, const int x_inc, float* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3403,8 +3411,8 @@ void cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const double alpha, const double* x, const int x_inc, double* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3434,8 +3442,8 @@ void cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, const float* y, const int y_inc, float* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3467,8 +3475,8 @@ void cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, const double* y, const int y_inc, double* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3502,8 +3510,8 @@ void cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, const float* y, const int y_inc, float* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3535,8 +3543,8 @@ void cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, const double* y, const int y_inc, double* ap) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; @@ -3575,8 +3583,8 @@ void cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const float* b, const int b_ld, const float beta, float* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -3612,8 +3620,8 @@ void cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const double* b, const int b_ld, const double beta, double* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -3649,8 +3657,8 @@ void cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; @@ -3686,8 +3694,8 @@ void cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; @@ -3725,8 +3733,8 @@ void cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const float* b, const int b_ld, const float beta, float* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -3762,8 +3770,8 @@ void cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const double* b, const int b_ld, const double beta, double* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -3799,8 +3807,8 @@ void cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; @@ -3836,8 +3844,8 @@ void cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; @@ -3875,8 +3883,8 @@ void cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; @@ -3912,8 +3920,8 @@ void cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; @@ -3950,8 +3958,8 @@ void cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* a, const int a_ld, const float beta, float* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -3982,8 +3990,8 @@ void cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* a, const int a_ld, const double beta, double* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -4014,8 +4022,8 @@ void cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; @@ -4046,8 +4054,8 @@ void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; @@ -4080,8 +4088,8 @@ void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, const float beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -4112,8 +4120,8 @@ void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, const double beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -4147,8 +4155,8 @@ void cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const float* b, const int b_ld, const float beta, float* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -4184,8 +4192,8 @@ void cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const double* b, const int b_ld, const double beta, double* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -4221,8 +4229,8 @@ void cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; @@ -4258,8 +4266,8 @@ void cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* b, const int b_ld, const void* beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; @@ -4297,8 +4305,8 @@ void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* b, const int b_ld, const float beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = beta; @@ -4334,8 +4342,8 @@ void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* b, const int b_ld, const double beta, void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = beta; @@ -4371,8 +4379,8 @@ void cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const float alpha, const float* a, const int a_ld, float* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; @@ -4402,8 +4410,8 @@ void cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const double alpha, const double* a, const int a_ld, double* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; @@ -4433,8 +4441,8 @@ void cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; @@ -4464,8 +4472,8 @@ void cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; @@ -4497,8 +4505,8 @@ void cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const float alpha, const float* a, const int a_ld, float* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; @@ -4528,8 +4536,8 @@ void cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const double alpha, const double* a, const int a_ld, double* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; @@ -4559,8 +4567,8 @@ void cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; @@ -4590,8 +4598,8 @@ void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; @@ -4628,8 +4636,8 @@ void cblas_shad(const int n, const float* y, const int y_inc, const float beta, float* z, const int z_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -4661,8 +4669,8 @@ void cblas_dhad(const int n, const double* y, const int y_inc, const double beta, double* z, const int z_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; @@ -4694,8 +4702,8 @@ void cblas_chad(const int n, const void* y, const int y_inc, const void* beta, void* z, const int z_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; @@ -4727,8 +4735,8 @@ void cblas_zhad(const int n, const void* y, const int y_inc, const void* beta, void* z, const int z_inc) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; @@ -4761,8 +4769,8 @@ void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const float alpha, const float* a, const int a_ld, float* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; @@ -4789,8 +4797,8 @@ void cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const double alpha, const double* a, const int a_ld, double* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; @@ -4817,8 +4825,8 @@ void cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; @@ -4845,8 +4853,8 @@ void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const void* alpha, const void* a, const int a_ld, void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; @@ -4873,8 +4881,8 @@ void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp void cblas_sim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const float* im, float* col) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto im_size = height * width * channels; const auto col_size = height * width * channels; @@ -4895,8 +4903,8 @@ void cblas_sim2col(const int channels, const int height, const int width, const void cblas_dim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const double* im, double* col) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto im_size = height * width * channels; const auto col_size = height * width * channels; @@ -4917,8 +4925,8 @@ void cblas_dim2col(const int channels, const int height, const int width, const void cblas_cim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const void* im, void* col) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto im_size = height * width * channels; const auto col_size = height * width * channels; @@ -4939,8 +4947,8 @@ void cblas_cim2col(const int channels, const int height, const int width, const void cblas_zim2col(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const void* im, void* col) { - auto device = get_device(); - auto context = clblast::Context(device); + OPTIONAL_STATIC auto device = get_device(); + OPTIONAL_STATIC auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto im_size = height * width * channels; const auto col_size = height * width * channels; diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 8ac0523f..94464990 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -447,8 +447,14 @@ class Program { // Source-based constructor with memory management explicit Program(const Context &context, const std::string &source) { - const char *source_ptr = &source[0]; - const auto length = source.length(); + #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND + const std::string source_null_kernel = source + "\n__kernel void null_kernel() {}\n"; + const char *source_ptr = &source_null_kernel[0]; + const auto length = source_null_kernel.length(); + #else + const char *source_ptr = &source[0]; + const auto length = source.length(); + #endif auto status = CL_SUCCESS; program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status); CLCudaAPIError::Check(status, "clCreateProgramWithSource"); @@ -723,9 +729,10 @@ class Buffer { } // Copies the contents of this buffer into another device buffer - void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const { + void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination, + EventPointer event = nullptr) const { CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0, - nullptr, nullptr)); + nullptr, event)); } void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const { CopyToAsync(queue, size, destination); @@ -764,10 +771,21 @@ class Kernel { kernel_(new cl_kernel, [](cl_kernel* k) { if (*k) { CheckErrorDtor(clReleaseKernel(*k)); } delete k; - }) { + }) + #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND + , null_kernel_(new cl_kernel, [](cl_kernel* k) { + if (*k) { CheckErrorDtor(clReleaseKernel(*k)); } + delete k; + }) + #endif + { auto status = CL_SUCCESS; *kernel_ = clCreateKernel(program->operator()(), name.c_str(), &status); CLCudaAPIError::Check(status, "clCreateKernel"); + #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND + *null_kernel_ = clCreateKernel(program->operator()(), "null_kernel", &status); + CLCudaAPIError::Check(status, "clCreateKernel"); + #endif } // Sets a kernel argument at the indicated position @@ -831,12 +849,21 @@ class Kernel { static_cast<cl_uint>(waitForEventsPlain.size()), !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr, event)); + #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND + const std::vector<size_t> nullRange = {1}; + CheckError(clEnqueueNDRangeKernel(queue(), *null_kernel_, static_cast<cl_uint>(nullRange.size()), + nullptr, nullRange.data(), nullptr, + 0, nullptr, nullptr)); + #endif } // Accessor to the private data-member const cl_kernel& operator()() const { return *kernel_; } private: std::shared_ptr<cl_kernel> kernel_; + #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND + std::shared_ptr<cl_kernel> null_kernel_; + #endif // Internal implementation for the recursive SetArguments function. template <typename T> diff --git a/src/database/kernels/copy/copy_3232.hpp b/src/database/kernels/copy/copy_3232.hpp index 64d56a7b..83ba8106 100644 --- a/src/database/kernels/copy/copy_3232.hpp +++ b/src/database/kernels/copy/copy_3232.hpp @@ -88,6 +88,7 @@ const DatabaseEntry CopyComplexSingle = { { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 16, 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 32, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, diff --git a/src/database/kernels/gemm_routine/gemm_routine_32.hpp b/src/database/kernels/gemm_routine/gemm_routine_32.hpp index b685d4bc..ba0cc5a3 100644 --- a/src/database/kernels/gemm_routine/gemm_routine_32.hpp +++ b/src/database/kernels/gemm_routine/gemm_routine_32.hpp @@ -33,6 +33,7 @@ const DatabaseEntry GemmRoutineSingle = { kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, @@ -62,7 +63,7 @@ const DatabaseEntry GemmRoutineSingle = { { // Default kDeviceTypeAll, "default", { { "default", { - { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 704, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp index c72db083..9977bb78 100644 --- a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp +++ b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp @@ -24,6 +24,7 @@ const DatabaseEntry GemmRoutineComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, @@ -49,7 +50,7 @@ const DatabaseEntry GemmRoutineComplexSingle = { { // Default kDeviceTypeAll, "default", { { "default", { - { kDeviceNameDefault , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/invert/invert_32.hpp b/src/database/kernels/invert/invert_32.hpp index b3f9143a..d550e3ba 100644 --- a/src/database/kernels/invert/invert_32.hpp +++ b/src/database/kernels/invert/invert_32.hpp @@ -24,6 +24,7 @@ const DatabaseEntry InvertSingle = { kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/invert/invert_3232.hpp b/src/database/kernels/invert/invert_3232.hpp index 11ea895d..d1103909 100644 --- a/src/database/kernels/invert/invert_3232.hpp +++ b/src/database/kernels/invert/invert_3232.hpp @@ -23,6 +23,7 @@ const DatabaseEntry InvertComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/pad/pad_3232.hpp b/src/database/kernels/pad/pad_3232.hpp index 88ae08a3..08ed21a4 100644 --- a/src/database/kernels/pad/pad_3232.hpp +++ b/src/database/kernels/pad/pad_3232.hpp @@ -88,6 +88,7 @@ const DatabaseEntry PadComplexSingle = { { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 32, 16, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 32, 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, diff --git a/src/database/kernels/padtranspose/padtranspose_3232.hpp b/src/database/kernels/padtranspose/padtranspose_3232.hpp index fb0ec5d0..32506c1e 100644 --- a/src/database/kernels/padtranspose/padtranspose_3232.hpp +++ b/src/database/kernels/padtranspose/padtranspose_3232.hpp @@ -88,10 +88,11 @@ const DatabaseEntry PadtransposeComplexSingle = { { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, - { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/transpose/transpose_3232.hpp b/src/database/kernels/transpose/transpose_3232.hpp index 45b2c3ff..b0de29f1 100644 --- a/src/database/kernels/transpose/transpose_3232.hpp +++ b/src/database/kernels/transpose/transpose_3232.hpp @@ -88,6 +88,7 @@ const DatabaseEntry TransposeComplexSingle = { { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, diff --git a/src/database/kernels/trsv_routine/trsv_routine_32.hpp b/src/database/kernels/trsv_routine/trsv_routine_32.hpp index 2ee82b71..7fc71f5e 100644 --- a/src/database/kernels/trsv_routine/trsv_routine_32.hpp +++ b/src/database/kernels/trsv_routine/trsv_routine_32.hpp @@ -24,6 +24,7 @@ const DatabaseEntry TrsvRoutineSingle = { kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/trsv_routine/trsv_routine_3232.hpp b/src/database/kernels/trsv_routine/trsv_routine_3232.hpp index 6f2f9306..221f12cf 100644 --- a/src/database/kernels/trsv_routine/trsv_routine_3232.hpp +++ b/src/database/kernels/trsv_routine/trsv_routine_3232.hpp @@ -23,6 +23,7 @@ const DatabaseEntry TrsvRoutineComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xaxpy/xaxpy_3232.hpp b/src/database/kernels/xaxpy/xaxpy_3232.hpp index 4a29da4d..25cd3630 100644 --- a/src/database/kernels/xaxpy/xaxpy_3232.hpp +++ b/src/database/kernels/xaxpy/xaxpy_3232.hpp @@ -88,6 +88,7 @@ const DatabaseEntry XaxpyComplexSingle = { { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 1, 256, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, diff --git a/src/database/kernels/xdot/xdot_32.hpp b/src/database/kernels/xdot/xdot_32.hpp index a7e685bf..2b7bc724 100644 --- a/src/database/kernels/xdot/xdot_32.hpp +++ b/src/database/kernels/xdot/xdot_32.hpp @@ -84,7 +84,7 @@ const DatabaseEntry XdotSingle = { { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 512, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, - { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, diff --git a/src/database/kernels/xdot/xdot_3232.hpp b/src/database/kernels/xdot/xdot_3232.hpp index ad2cf414..e655e17e 100644 --- a/src/database/kernels/xdot/xdot_3232.hpp +++ b/src/database/kernels/xdot/xdot_3232.hpp @@ -82,6 +82,7 @@ const DatabaseEntry XdotComplexSingle = { { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 32, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, diff --git a/src/database/kernels/xgemm/xgemm_32.hpp b/src/database/kernels/xgemm/xgemm_32.hpp index 32358dbc..d75758ea 100644 --- a/src/database/kernels/xgemm/xgemm_32.hpp +++ b/src/database/kernels/xgemm/xgemm_32.hpp @@ -90,7 +90,7 @@ const DatabaseEntry XgemmSingle = { { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 1, 4, 1, 1, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 0, 1, 32, 2, 32, 8, 64, 16, 16, 128, 0, 0, 0, 1, 1, 2 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 0, 1, 16, 2, 16, 8, 32, 8, 16, 128, 1, 1, 1, 1, 2, 4 } }, - { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 16, 128, 1, 1, 0, 1, 1, 4 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 2, 1, 1, 4, 4, 32, 8, 8, 64, 0, 0, 0, 0, 2, 2 } }, { Name{"Iris "}, Params{ 0, 1, 16, 8, 16, 8, 128, 32, 16, 64, 1, 1, 1, 1, 4, 1 } }, { Name{"Iris Pro "}, Params{ 0, 1, 16, 2, 16, 8, 64, 32, 32, 128, 1, 1, 1, 0, 4, 4 } }, diff --git a/src/database/kernels/xgemm/xgemm_3232.hpp b/src/database/kernels/xgemm/xgemm_3232.hpp index 22959347..9c0b70b1 100644 --- a/src/database/kernels/xgemm/xgemm_3232.hpp +++ b/src/database/kernels/xgemm/xgemm_3232.hpp @@ -88,10 +88,11 @@ const DatabaseEntry XgemmComplexSingle = { { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 0, 1, 16, 8, 8, 8, 32, 16, 16, 64, 1, 0, 0, 0, 4, 4 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 0, 1, 16, 2, 16, 8, 32, 8, 8, 32, 0, 0, 1, 0, 1, 1 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 0, 1, 32, 8, 16, 16, 64, 16, 16, 64, 1, 1, 1, 1, 2, 1 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 4, 1, 1, 32, 32, 128, 16, 16, 128, 0, 0, 0, 0, 4, 1 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 2, 1, 1, 4, 4, 16, 8, 8, 64, 0, 0, 0, 0, 2, 2 } }, { Name{"Iris "}, Params{ 0, 1, 32, 8, 32, 16, 64, 8, 16, 64, 1, 0, 1, 0, 1, 1 } }, { Name{"Iris Pro "}, Params{ 0, 1, 16, 2, 8, 8, 32, 32, 8, 32, 1, 1, 1, 1, 1, 1 } }, - { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 32, 32, 128, 1, 1, 1, 0, 2, 2 } }, + { kDeviceNameDefault , Params{ 0, 1, 16, 2, 16, 8, 32, 8, 8, 32, 0, 0, 1, 0, 1, 1 } }, } }, } }, diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp index f6ea9523..146018d5 100644 --- a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp +++ b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp @@ -69,7 +69,7 @@ const DatabaseEntry XgemmDirectSingle = { { "default", { { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, - { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 8, 16, 8, 16, 16, 1, 0, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 8, 16, 32, 16, 8, 1, 0, 1, 1, 64, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 4, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp index 8f24ee7d..6dd95b38 100644 --- a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp +++ b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp @@ -63,9 +63,10 @@ const DatabaseEntry XgemmDirectComplexSingle = { kDeviceTypeGPU, "Intel", { { "default", { { Name{"Intel(R) HD Graphics 620 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 32, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0, 0, 0 } }, - { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/xgemv/xgemv_3232.hpp b/src/database/kernels/xgemv/xgemv_3232.hpp index dc9a0a88..2b4328b3 100644 --- a/src/database/kernels/xgemv/xgemv_3232.hpp +++ b/src/database/kernels/xgemv/xgemv_3232.hpp @@ -86,6 +86,7 @@ const DatabaseEntry XgemvComplexSingle = { { Name{"Intel(R) HD Graphics 530 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp index 146bd466..351973ae 100644 --- a/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp +++ b/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp @@ -92,7 +92,7 @@ const DatabaseEntry XgemvFastSingle = { { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 4, 64, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, - { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp index 693fac4e..e9928b28 100644 --- a/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp +++ b/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp @@ -86,6 +86,7 @@ const DatabaseEntry XgemvFastComplexSingle = { { Name{"Intel(R) HD Graphics 530 "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 1, 32, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 2, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp index 42e7a36d..cf1b4e55 100644 --- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp +++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp @@ -70,7 +70,7 @@ const DatabaseEntry XgemvFastRotSingle = { { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, - { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 128, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp index 98d5cf6a..bf780835 100644 --- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp +++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp @@ -68,6 +68,7 @@ const DatabaseEntry XgemvFastRotComplexSingle = { { "default", { { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 2, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 4, 128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 4, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, diff --git a/src/database/kernels/xger/xger_3232.hpp b/src/database/kernels/xger/xger_3232.hpp index 123fc4fa..763f2ca6 100644 --- a/src/database/kernels/xger/xger_3232.hpp +++ b/src/database/kernels/xger/xger_3232.hpp @@ -87,6 +87,7 @@ const DatabaseEntry XgerComplexSingle = { { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 620 "}, Params{ 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile "}, Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Intel(R) HD Graphics IvyBridge M GT2 "}, Params{ 16, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Iris Pro "}, Params{ 16, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, diff --git a/src/kernel_preprocessor.cpp b/src/kernel_preprocessor.cpp index aa946bab..1c422d33 100644 --- a/src/kernel_preprocessor.cpp +++ b/src/kernel_preprocessor.cpp @@ -557,6 +557,8 @@ std::string PreprocessKernelSource(const std::string& kernel_source) { lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers); lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false); lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false); + lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false); + lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, false); lines = PreprocessUnrollLoops(lines, defines, arrays_to_registers, true); // Gather the results diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 3cfc5dfb..80a60107 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -43,8 +43,6 @@ // literal). Comment-out this line for syntax-highlighting when developing. R"( -// ================================================================================================= - // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library. #ifndef GEMMK @@ -397,9 +395,7 @@ INLINE_FUNC realN LocalToPrivateB(LOCAL_PTR realN* blm, const int _ni, const int } #endif -// ================================================================================================= - -// End of the C++11 raw string literal )" +// End of the C++11 raw string literal // ================================================================================================= diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index 17c8955a..ee4d5da5 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -15,8 +15,6 @@ // literal). Comment-out this line for syntax-highlighting when developing. R"( -// ================================================================================================= - // The vectorised multiply-add function INLINE_FUNC realM MultiplyAddVector(realM cvec, const realM avec, const real bval) { #if USE_VECTOR_MAD == 1 @@ -171,9 +169,7 @@ INLINE_FUNC void StoreResults(__global realM* cgm, realM c_value, const int _mi, cgm[index] = result; } -// ================================================================================================= - -// End of the C++11 raw string literal )" +// End of the C++11 raw string literal // ================================================================================================= diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl index 90de0b3b..77964a94 100644 --- a/src/kernels/level3/xgemm_part3.opencl +++ b/src/kernels/level3/xgemm_part3.opencl @@ -15,14 +15,12 @@ // literal). Comment-out this line for syntax-highlighting when developing. R"( -// ================================================================================================= - // A common interface for subgroup functions #if USE_SUBGROUP_SHUFFLING == 1 INLINE_FUNC int clblast_get_sub_group_local_id() { - + // Intel extension #if SUBGROUP_SHUFFLING_INTEL == 1 return get_sub_group_local_id(); @@ -36,7 +34,7 @@ INLINE_FUNC int clblast_get_sub_group_local_id() { } INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) { - + // Intel extension #if SUBGROUP_SHUFFLING_INTEL == 1 return intel_sub_group_shuffle(reg, src); @@ -238,48 +236,47 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { #pragma unroll for (int _ki = 0; _ki < KREG/VWN; _ki += 1) { - const int index = _ni * (MWI/VWM) + _mi; #if USE_SUBGROUP_SHUFFLING == 1 const realN aval = clblast_sub_group_shuffle(apm[_ki], _ni); #else const realN aval = apm[_ni * (KREG/VWN) + _ki]; #endif #if VWN == 1 - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval); #elif VWN == 2 - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y); #elif VWN == 4 - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.z); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.w); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.x); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.y); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.z); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.w); #elif VWN == 8 - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.s0); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.s1); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.s2); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.s3); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 4) * (MWI/VWM) + _mi], aval.s4); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 5) * (MWI/VWM) + _mi], aval.s5); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 6) * (MWI/VWM) + _mi], aval.s6); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 7) * (MWI/VWM) + _mi], aval.s7); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval.s0); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1) * (MWI/VWM) + _mi], aval.s1); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2) * (MWI/VWM) + _mi], aval.s2); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3) * (MWI/VWM) + _mi], aval.s3); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 4) * (MWI/VWM) + _mi], aval.s4); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 5) * (MWI/VWM) + _mi], aval.s5); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 6) * (MWI/VWM) + _mi], aval.s6); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 7) * (MWI/VWM) + _mi], aval.s7); #elif VWN == 16 - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0 ) * (MWI/VWM) + _mi], aval.s0); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 1 ) * (MWI/VWM) + _mi], aval.s1); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 2 ) * (MWI/VWM) + _mi], aval.s2); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 3 ) * (MWI/VWM) + _mi], aval.s3); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 4 ) * (MWI/VWM) + _mi], aval.s4); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 5 ) * (MWI/VWM) + _mi], aval.s5); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 6 ) * (MWI/VWM) + _mi], aval.s6); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 7 ) * (MWI/VWM) + _mi], aval.s7); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 8 ) * (MWI/VWM) + _mi], aval.s8); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 9 ) * (MWI/VWM) + _mi], aval.s9); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 10) * (MWI/VWM) + _mi], aval.sA); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 11) * (MWI/VWM) + _mi], aval.sB); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 12) * (MWI/VWM) + _mi], aval.sC); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 13) * (MWI/VWM) + _mi], aval.sD); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 14) * (MWI/VWM) + _mi], aval.sE); - cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 15) * (MWI/VWM) + _mi], aval.sF); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 0 ) * (MWI/VWM) + _mi], aval.s0); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 1 ) * (MWI/VWM) + _mi], aval.s1); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 2 ) * (MWI/VWM) + _mi], aval.s2); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 3 ) * (MWI/VWM) + _mi], aval.s3); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 4 ) * (MWI/VWM) + _mi], aval.s4); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 5 ) * (MWI/VWM) + _mi], aval.s5); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 6 ) * (MWI/VWM) + _mi], aval.s6); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 7 ) * (MWI/VWM) + _mi], aval.s7); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 8 ) * (MWI/VWM) + _mi], aval.s8); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 9 ) * (MWI/VWM) + _mi], aval.s9); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 10) * (MWI/VWM) + _mi], aval.sA); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 11) * (MWI/VWM) + _mi], aval.sB); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 12) * (MWI/VWM) + _mi], aval.sC); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 13) * (MWI/VWM) + _mi], aval.sD); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 14) * (MWI/VWM) + _mi], aval.sE); + cpm[_ni * (MWI/VWM) + _mi] = MultiplyAddVector(cpm[_ni * (MWI/VWM) + _mi], bpm[(VWN * _ki + 15) * (MWI/VWM) + _mi], aval.sF); #endif } } @@ -311,9 +308,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, } } -// ================================================================================================= - -// End of the C++11 raw string literal )" +// End of the C++11 raw string literal // ================================================================================================= diff --git a/src/kernels/level3/xgemm_part4.opencl b/src/kernels/level3/xgemm_part4.opencl index e581cd84..b1f1ade6 100644 --- a/src/kernels/level3/xgemm_part4.opencl +++ b/src/kernels/level3/xgemm_part4.opencl @@ -15,7 +15,6 @@ // literal). Comment-out this line for syntax-highlighting when developing. R"( -// ================================================================================================= // The upper-triangular and lower-triangular kernels are only used in special cases #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) @@ -132,9 +131,8 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, } #endif -// ================================================================================================= -// End of the C++11 raw string literal )" +// End of the C++11 raw string literal // ================================================================================================= diff --git a/src/routines/common.hpp b/src/routines/common.hpp index c30a2e0e..c6db0152 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -76,6 +76,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, // Determines the right kernel auto kernel_name = std::string{}; + auto pad_kernel = false; if (do_transpose) { if (use_fast_kernel && IsMultiple(src_ld, db["TRA_WPT"]) && @@ -85,7 +86,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, } else { use_fast_kernel = false; - kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix"; + pad_kernel = (do_pad || do_conjugate); + kernel_name = (pad_kernel) ? "TransposePadMatrix" : "TransposeMatrix"; } } else { @@ -97,7 +99,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, } else { use_fast_kernel = false; - kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix"; + pad_kernel = do_pad; + kernel_name = (pad_kernel) ? "CopyPadMatrix" : "CopyMatrix"; } } @@ -123,7 +126,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, kernel.SetArgument(8, static_cast<int>(dest_offset)); kernel.SetArgument(9, dest()); kernel.SetArgument(10, GetRealArg(alpha)); - if (do_pad) { + if (pad_kernel) { kernel.SetArgument(11, static_cast<int>(do_conjugate)); } else { diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp index 76401753..2a5a5664 100644 --- a/src/routines/level2/xtrsv.cpp +++ b/src/routines/level2/xtrsv.cpp @@ -33,7 +33,8 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle, const size_t n, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + EventPointer event) { if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); }; @@ -69,9 +70,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle, // Launches the kernel const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]}; const auto global = std::vector<size_t>{Ceil(n, db_["TRSV_BLOCK_SIZE"])}; - auto event = Event(); - RunKernel(kernel, queue_, device_, global, local, event.pointer()); - event.WaitForCompletion(); + RunKernel(kernel, queue_, device_, global, local, event); } // ================================================================================================= @@ -146,14 +145,16 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle, } // Runs the triangular substitution for the block size + auto sub_event = Event(); Substitution(layout, triangle, a_transpose, diagonal, block_size, a_buffer, a_offset + col + col*a_ld, a_ld, b_buffer, b_offset + col*b_inc, b_inc, - x_buffer, x_offset + col*x_inc, x_inc); + x_buffer, x_offset + col*x_inc, x_inc, sub_event.pointer()); + sub_event.WaitForCompletion(); } // Retrieves the results - x_buffer.CopyTo(queue_, x_size, b_buffer); + x_buffer.CopyToAsync(queue_, x_size, b_buffer, event_); } // ================================================================================================= diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp index 67e626a1..8a900a35 100644 --- a/src/routines/level2/xtrsv.hpp +++ b/src/routines/level2/xtrsv.hpp @@ -32,6 +32,7 @@ class Xtrsv: public Xgemv<T> { using Xgemv<T>::device_; using Xgemv<T>::db_; using Xgemv<T>::program_; + using Xgemv<T>::event_; using Xgemv<T>::DoGemv; // Constructor @@ -50,7 +51,8 @@ class Xtrsv: public Xgemv<T> { const size_t n, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc, - const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc); + const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc, + EventPointer event); }; // ================================================================================================= diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index fd5a20db..cb24460a 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -40,6 +40,7 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name): , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" }) { diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index 6912d3a9..2e6f30ec 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -32,6 +32,7 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name): , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" }) { diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index 6bb2a24f..5ffdc028 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -32,6 +32,7 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name): , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" }) { diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp index 905660ff..fe5d1e14 100644 --- a/src/routines/level3/xtrsm.cpp +++ b/src/routines/level3/xtrsm.cpp @@ -246,7 +246,7 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle, } // Retrieves the results - x_buffer.CopyTo(queue_, b_size, b_buffer); + x_buffer.CopyToAsync(queue_, b_size, b_buffer, event_); } // ================================================================================================= diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp index 5b42398e..871d7253 100644 --- a/src/routines/level3/xtrsm.hpp +++ b/src/routines/level3/xtrsm.hpp @@ -31,6 +31,7 @@ class Xtrsm: public Xgemm<T> { using Xgemm<T>::device_; using Xgemm<T>::db_; using Xgemm<T>::program_; + using Xgemm<T>::event_; using Xgemm<T>::DoGemm; // Constructor diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp index 2bbc5007..b12b8734 100644 --- a/src/routines/levelx/xgemmbatched.cpp +++ b/src/routines/levelx/xgemmbatched.cpp @@ -38,6 +38,7 @@ XgemmBatched<T>::XgemmBatched(Queue &queue, EventPointer event, const std::strin , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 diff --git a/src/routines/levelx/xgemmstridedbatched.cpp b/src/routines/levelx/xgemmstridedbatched.cpp index 30c161cc..d9e3ebba 100644 --- a/src/routines/levelx/xgemmstridedbatched.cpp +++ b/src/routines/levelx/xgemmstridedbatched.cpp @@ -37,6 +37,7 @@ XgemmStridedBatched<T>::XgemmStridedBatched(Queue &queue, EventPointer event, co , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 diff --git a/src/tuning/configurations.cpp b/src/tuning/configurations.cpp index 1fe232cf..82d7e3b4 100644 --- a/src/tuning/configurations.cpp +++ b/src/tuning/configurations.cpp @@ -23,28 +23,42 @@ namespace clblast { // Finds all configurations. It also applies the user-defined constraints within. std::vector<Configuration> SetConfigurations(const Device& device, const std::vector<Parameter> parameters, + const std::vector<size_t>& local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, const Constraints& constraints, const LocalMemSizeInfo& local_mem_size_info) { const auto local_mem_max = device.LocalMemSize(); + const auto max_work_item_sizes = device.MaxWorkItemSizes(); + const auto max_work_group_size = device.MaxWorkGroupSize(); auto config = Configuration(); auto configurations = std::vector<Configuration>(); - PopulateConfigurations(parameters, 0, config, configurations, - local_mem_max, constraints, local_mem_size_info); + PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config, + 0, config, configurations, + local_mem_max, constraints, local_mem_size_info, + max_work_item_sizes, max_work_group_size); return configurations; } // Iterates recursively over all permutations of the user-defined parameters void PopulateConfigurations(const std::vector<Parameter> ¶meters, + const std::vector<size_t> local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, const size_t index, const Configuration &config, std::vector<Configuration> &configuration, const size_t local_mem_max, const Constraints& constraints, - const LocalMemSizeInfo& local_mem_size_info) { + const LocalMemSizeInfo& local_mem_size_info, + const std::vector<size_t>& max_work_item_sizes, + const size_t max_work_group_size) { // End of the chain: all parameters are considered, store the resulting configuration if it is a // valid one according to the constraints if (index == parameters.size()) { - if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info)) { + if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info, + local_size_base, mul_local_config, div_local_config, + max_work_item_sizes, max_work_group_size)) { configuration.push_back(config); } return; @@ -55,8 +69,10 @@ void PopulateConfigurations(const std::vector<Parameter> ¶meters, for (auto &value: parameter.second) { auto config_copy = config; config_copy[parameter.first] = value; - PopulateConfigurations(parameters, index+1, config_copy, configuration, - local_mem_max, constraints, local_mem_size_info); + PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config, + index+1, config_copy, configuration, + local_mem_max, constraints, local_mem_size_info, + max_work_item_sizes, max_work_group_size); } } @@ -64,7 +80,12 @@ void PopulateConfigurations(const std::vector<Parameter> ¶meters, bool ValidConfiguration(const Configuration &config, const size_t local_mem_max, const Constraints& constraints, - const LocalMemSizeInfo& local_mem_size_info) { + const LocalMemSizeInfo& local_mem_size_info, + const std::vector<size_t> local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, + const std::vector<size_t>& max_work_item_sizes, + const size_t max_work_group_size) { // Iterates over all constraints for (auto &constraint: constraints) { @@ -92,6 +113,20 @@ bool ValidConfiguration(const Configuration &config, return false; } + // Checks the local thread size (both per dimension and in total) + const auto local = SetThreadConfiguration(config, local_size_base, + mul_local_config, div_local_config); + for (auto i=size_t{0}; i<local.size(); ++i) { + if (local[i] > max_work_item_sizes[i]) { + return false; + } + } + auto local_size = size_t{1}; + for (auto &item: local) { local_size *= item; } + if (local_size > max_work_group_size) { + return false; + } + // Everything was OK: this configuration is valid return true; } diff --git a/src/tuning/configurations.hpp b/src/tuning/configurations.hpp index faa5498f..4b9ba93f 100644 --- a/src/tuning/configurations.hpp +++ b/src/tuning/configurations.hpp @@ -50,6 +50,9 @@ struct LocalMemSizeInfo { // function to find all configurations. It also applies the user-defined constraints within. std::vector<Configuration> SetConfigurations(const Device& device, const std::vector<Parameter> parameters, + const std::vector<size_t>& local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, const Constraints& constraints, const LocalMemSizeInfo& local_mem_size_info); @@ -58,11 +61,16 @@ std::vector<Configuration> SetConfigurations(const Device& device, // At the end of each chain (when all parameters are considered), the function stores the result // into the configuration list. void PopulateConfigurations(const std::vector<Parameter> ¶meters, + const std::vector<size_t> local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, const size_t index, const Configuration &config, std::vector<Configuration> &configuration, const size_t local_mem_max, const Constraints& constraints, - const LocalMemSizeInfo& local_mem_size_info); + const LocalMemSizeInfo& local_mem_size_info, + const std::vector<size_t>& max_work_item_sizes, + const size_t max_work_group_size); // Loops over all user-defined constraints to check whether or not the configuration is valid. // Assumes initially all configurations are valid, then returns false if one of the constraints has @@ -71,7 +79,12 @@ void PopulateConfigurations(const std::vector<Parameter> ¶meters, bool ValidConfiguration(const Configuration &config, const size_t local_mem_max, const Constraints& constraints, - const LocalMemSizeInfo& local_mem_size_info); + const LocalMemSizeInfo& local_mem_size_info, + const std::vector<size_t> local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, + const std::vector<size_t>& max_work_item_sizes, + const size_t max_work_group_size); // Processes multipliers and dividers to obtain the final thread configuration std::vector<size_t> SetThreadConfiguration(const Configuration& config, diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 75e776e6..dd907ba4 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -33,9 +33,13 @@ void StartVariation(int argc, char *argv[]) { // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { + printf("* (1/4) Tuning main GEMM kernel (GEMMK == 0) for fixed set of parameters\n\n"); StartVariation<1>(argc, argv); + printf("* (2/4) Tuning main GEMM kernel (GEMMK == 0) for random parameters out of larger set\n\n"); StartVariation<2>(argc, argv); + printf("* (3/4) Tuning secondary GEMM kernel (GEMMK == 1) for fixed set of parameters\n\n"); StartVariation<11>(argc, argv); + printf("* (4/4) Tuning secondary GEMM kernel (GEMMK == 1) for random parameters out of larger set\n\n"); StartVariation<12>(argc, argv); return 0; } diff --git a/src/tuning/kernels/xgemm.hpp b/src/tuning/kernels/xgemm.hpp index 9a538c1b..fa1bb6ec 100644 --- a/src/tuning/kernels/xgemm.hpp +++ b/src/tuning/kernels/xgemm.hpp @@ -50,6 +50,8 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments<T> &args) { settings.sources += #include "../src/kernels/level3/xgemm_part1.opencl" #include "../src/kernels/level3/xgemm_part2.opencl" + ; + settings.sources += #include "../src/kernels/level3/xgemm_part3.opencl" #include "../src/kernels/level3/xgemm_part4.opencl" ; diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp index 822f8851..d382fb18 100644 --- a/src/tuning/tuning.cpp +++ b/src/tuning/tuning.cpp @@ -172,7 +172,8 @@ void Tuner(int argc, char* argv[], const int V, } // Sets the tunable parameters and their possible values - auto configurations = SetConfigurations(device, settings.parameters, + auto configurations = SetConfigurations(device, settings.parameters, settings.local_size, + settings.mul_local, settings.div_local, SetConstraints(V), ComputeLocalMemSize(V)); printf("* Found %s%zu configuration(s)%s\n", kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str()); diff --git a/src/tuning/tuning_api.cpp b/src/tuning/tuning_api.cpp index 2eec2e2e..2cc9b786 100644 --- a/src/tuning/tuning_api.cpp +++ b/src/tuning/tuning_api.cpp @@ -264,7 +264,8 @@ StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V, } // Sets the tunable parameters and their possible values - auto configurations = SetConfigurations(device, settings.parameters, + auto configurations = SetConfigurations(device, settings.parameters, settings.local_size, + settings.mul_local, settings.div_local, SetConstraints(V), ComputeLocalMemSize(V)); // Select the search method (full search or a random fraction) diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp index 835f54b4..00cb90cb 100644 --- a/src/utilities/compile.cpp +++ b/src/utilities/compile.cpp @@ -59,7 +59,8 @@ std::shared_ptr<Program> CompileFromSource( } // For Intel GPUs with subgroup support, use subgroup shuffling. - if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) { + if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups) && + (precision == Precision::kSingle || precision == Precision::kHalf)) { header_string += "#define USE_SUBGROUP_SHUFFLING 1\n"; header_string += "#define SUBGROUP_SHUFFLING_INTEL 1\n"; } |