diff options
author | Gard Spreemann <gspr@nonempty.org> | 2021-01-20 15:42:05 +0100 |
---|---|---|
committer | Gard Spreemann <gspr@nonempty.org> | 2021-01-20 15:42:05 +0100 |
commit | c8e17e202f2ac8ab338a1a444b6be37a97e38226 (patch) | |
tree | 9ecce4c2b90103725817782cc4412835ec95fac0 /src | |
parent | b991afa34ae943548146e4f71bbbba8f1f4b1000 (diff) | |
parent | 70016e869881df837402def4904b2888247e02d9 (diff) |
Merge tag '1.5.2' into debian/sid
Diffstat (limited to 'src')
74 files changed, 761 insertions, 34 deletions
diff --git a/src/clpp11.hpp b/src/clpp11.hpp index aeb14989..2a25606c 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -47,6 +47,7 @@ #include <assert.h> // OpenCL +#define CL_TARGET_OPENCL_VERSION 110 #define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings #define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings #define CL_USE_DEPRECATED_OPENCL_2_0_APIS // to disable deprecation warnings @@ -508,12 +509,35 @@ class Program { // Retrieves a binary or an intermediate representation of the compiled program std::string GetIR() const { - auto bytes = size_t{0}; - CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr)); + cl_uint num_devices = 0; + CheckError(clGetProgramInfo(program_, CL_PROGRAM_NUM_DEVICES, + sizeof(cl_uint), &num_devices, nullptr)); + + std::vector<size_t> binSizesInBytes(num_devices, 0); + CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES, + num_devices * sizeof(size_t), binSizesInBytes.data(), nullptr)); + + auto bytes = size_t{0}; + auto binSizeIter = size_t{0}; + // Loop over the program binary sizes to find a binary whose size is > 0. + // The current logic assumes that there ever is only one valid program binary + // in a given cl_program. This should be the case unless the cl_program + // is built for all or a subset of devices associated to a given cl_program + for (; binSizeIter < binSizesInBytes.size(); ++binSizeIter) { + if (binSizesInBytes[binSizeIter] > 0) { + bytes = binSizesInBytes[binSizeIter]; + break; + } + } auto result = std::string{}; result.resize(bytes); - auto result_ptr = result.data(); - CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr)); + + std::vector<char*> out(num_devices, nullptr); + out[binSizeIter] = const_cast<char*>(result.data()); + + CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARIES, + num_devices * sizeof(char*), + out.data(), nullptr)); return result; } diff --git a/src/database/database_structure.hpp b/src/database/database_structure.hpp index 0199ec3a..ec6a16d2 100644 --- a/src/database/database_structure.hpp +++ b/src/database/database_structure.hpp @@ -21,6 +21,7 @@ // Just needed for 'Precision' #ifdef OPENCL_API + #define CL_TARGET_OPENCL_VERSION 110 #include "clblast.h" #elif CUDA_API #include "clblast_cuda.h" diff --git a/src/database/kernels/copy/copy_16.hpp b/src/database/kernels/copy/copy_16.hpp index 564bea6b..a13072f0 100644 --- a/src/database/kernels/copy/copy_16.hpp +++ b/src/database/kernels/copy/copy_16.hpp @@ -18,6 +18,10 @@ const DatabaseEntry CopyHalf = { { Name{"AMD Radeon RX 480 "}, Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 8, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/copy/copy_32.hpp b/src/database/kernels/copy/copy_32.hpp index 3ef38ccb..9f24afea 100644 --- a/src/database/kernels/copy/copy_32.hpp +++ b/src/database/kernels/copy/copy_32.hpp @@ -51,6 +51,10 @@ const DatabaseEntry CopySingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -87,6 +91,7 @@ const DatabaseEntry CopySingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "default", { + { Name{"Intel(R) Gen9 HD Graphics NEO "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 530 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -154,6 +159,10 @@ const DatabaseEntry CopySingle = { { Name{"TITAN X (Pascal) "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 16, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/copy/copy_3232.hpp b/src/database/kernels/copy/copy_3232.hpp index 81feba9f..08ee3c25 100644 --- a/src/database/kernels/copy/copy_3232.hpp +++ b/src/database/kernels/copy/copy_3232.hpp @@ -51,6 +51,10 @@ const DatabaseEntry CopyComplexSingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -149,6 +153,10 @@ const DatabaseEntry CopyComplexSingle = { { Name{"TITAN X (Pascal) "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/copy/copy_64.hpp b/src/database/kernels/copy/copy_64.hpp index 52ce191e..f6ef641e 100644 --- a/src/database/kernels/copy/copy_64.hpp +++ b/src/database/kernels/copy/copy_64.hpp @@ -43,6 +43,10 @@ const DatabaseEntry CopyDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 32, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 32, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -136,8 +140,12 @@ const DatabaseEntry CopyDouble = { { Name{"TITAN X (Pascal) "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { - { kDeviceNameDefault , Params{ 32, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/copy/copy_6464.hpp b/src/database/kernels/copy/copy_6464.hpp index d0e31201..8607efc5 100644 --- a/src/database/kernels/copy/copy_6464.hpp +++ b/src/database/kernels/copy/copy_6464.hpp @@ -43,6 +43,10 @@ const DatabaseEntry CopyComplexDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -136,6 +140,10 @@ const DatabaseEntry CopyComplexDouble = { { Name{"TITAN X (Pascal) "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/gemm_routine/gemm_routine_32.hpp b/src/database/kernels/gemm_routine/gemm_routine_32.hpp index 97b9c5ed..68e18f01 100644 --- a/src/database/kernels/gemm_routine/gemm_routine_32.hpp +++ b/src/database/kernels/gemm_routine/gemm_routine_32.hpp @@ -61,6 +61,10 @@ const DatabaseEntry GemmRoutineSingle = { { Name{"TITAN X (Pascal) "}, Params{ 1664, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp index c3e60fe5..3f5e32bc 100644 --- a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp +++ b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp @@ -48,15 +48,19 @@ const DatabaseEntry GemmRoutineComplexSingle = { { Name{"TITAN X (Pascal) "}, Params{ 1472, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1856, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1856, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { - { kDeviceNameDefault , Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1408, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { - { kDeviceNameDefault , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/gemm_routine/gemm_routine_64.hpp b/src/database/kernels/gemm_routine/gemm_routine_64.hpp index bd3df89a..036da90f 100644 --- a/src/database/kernels/gemm_routine/gemm_routine_64.hpp +++ b/src/database/kernels/gemm_routine/gemm_routine_64.hpp @@ -39,15 +39,19 @@ const DatabaseEntry GemmRoutineDouble = { { Name{"TITAN X (Pascal) "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { - { kDeviceNameDefault , Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, { // Default kDeviceTypeAll, "default", { { "default", { - { kDeviceNameDefault , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/gemm_routine/gemm_routine_6464.hpp b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp index 044fa5d8..facfd8c8 100644 --- a/src/database/kernels/gemm_routine/gemm_routine_6464.hpp +++ b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp @@ -39,6 +39,10 @@ const DatabaseEntry GemmRoutineComplexDouble = { { Name{"TITAN X (Pascal) "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/invert/invert_32.hpp b/src/database/kernels/invert/invert_32.hpp index aaaec3e5..aeb8a6d4 100644 --- a/src/database/kernels/invert/invert_32.hpp +++ b/src/database/kernels/invert/invert_32.hpp @@ -50,6 +50,10 @@ const DatabaseEntry InvertSingle = { { Name{"GeForce GTX 1070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/invert/invert_3232.hpp b/src/database/kernels/invert/invert_3232.hpp index 5666e9a2..575b104a 100644 --- a/src/database/kernels/invert/invert_3232.hpp +++ b/src/database/kernels/invert/invert_3232.hpp @@ -49,6 +49,10 @@ const DatabaseEntry InvertComplexSingle = { { Name{"GeForce GTX 1070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/invert/invert_64.hpp b/src/database/kernels/invert/invert_64.hpp index 2cd5e369..96a5223a 100644 --- a/src/database/kernels/invert/invert_64.hpp +++ b/src/database/kernels/invert/invert_64.hpp @@ -40,6 +40,10 @@ const DatabaseEntry InvertDouble = { { Name{"GeForce GTX 1070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/invert/invert_6464.hpp b/src/database/kernels/invert/invert_6464.hpp index 4e518e81..ebacda6c 100644 --- a/src/database/kernels/invert/invert_6464.hpp +++ b/src/database/kernels/invert/invert_6464.hpp @@ -40,6 +40,10 @@ const DatabaseEntry InvertComplexDouble = { { Name{"GeForce GTX 1070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/pad/pad_16.hpp b/src/database/kernels/pad/pad_16.hpp index ca5acf75..17367a44 100644 --- a/src/database/kernels/pad/pad_16.hpp +++ b/src/database/kernels/pad/pad_16.hpp @@ -18,6 +18,10 @@ const DatabaseEntry PadHalf = { { Name{"AMD Radeon RX 480 "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/pad/pad_32.hpp b/src/database/kernels/pad/pad_32.hpp index 7e554e31..52529d1e 100644 --- a/src/database/kernels/pad/pad_32.hpp +++ b/src/database/kernels/pad/pad_32.hpp @@ -51,6 +51,10 @@ const DatabaseEntry PadSingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -154,6 +158,10 @@ const DatabaseEntry PadSingle = { { Name{"TITAN X (Pascal) "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/pad/pad_3232.hpp b/src/database/kernels/pad/pad_3232.hpp index cea79c14..cfe5c632 100644 --- a/src/database/kernels/pad/pad_3232.hpp +++ b/src/database/kernels/pad/pad_3232.hpp @@ -51,6 +51,10 @@ const DatabaseEntry PadComplexSingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -151,6 +155,10 @@ const DatabaseEntry PadComplexSingle = { { Name{"TITAN X (Pascal) "}, Params{ 32, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/pad/pad_64.hpp b/src/database/kernels/pad/pad_64.hpp index e85dcb17..b398eca9 100644 --- a/src/database/kernels/pad/pad_64.hpp +++ b/src/database/kernels/pad/pad_64.hpp @@ -43,6 +43,10 @@ const DatabaseEntry PadDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -136,8 +140,12 @@ const DatabaseEntry PadDouble = { { Name{"TITAN X (Pascal) "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { - { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/pad/pad_6464.hpp b/src/database/kernels/pad/pad_6464.hpp index 52a4d0e0..b3898574 100644 --- a/src/database/kernels/pad/pad_6464.hpp +++ b/src/database/kernels/pad/pad_6464.hpp @@ -43,6 +43,10 @@ const DatabaseEntry PadComplexDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -136,6 +140,10 @@ const DatabaseEntry PadComplexDouble = { { Name{"TITAN X (Pascal) "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/padtranspose/padtranspose_16.hpp b/src/database/kernels/padtranspose/padtranspose_16.hpp index 4a4f619c..13979178 100644 --- a/src/database/kernels/padtranspose/padtranspose_16.hpp +++ b/src/database/kernels/padtranspose/padtranspose_16.hpp @@ -18,6 +18,10 @@ const DatabaseEntry PadtransposeHalf = { { Name{"AMD Radeon RX 480 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/padtranspose/padtranspose_32.hpp b/src/database/kernels/padtranspose/padtranspose_32.hpp index f7adc03c..fb09bc16 100644 --- a/src/database/kernels/padtranspose/padtranspose_32.hpp +++ b/src/database/kernels/padtranspose/padtranspose_32.hpp @@ -51,6 +51,10 @@ const DatabaseEntry PadtransposeSingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -153,6 +157,10 @@ const DatabaseEntry PadtransposeSingle = { { Name{"TITAN X (Pascal) "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/padtranspose/padtranspose_3232.hpp b/src/database/kernels/padtranspose/padtranspose_3232.hpp index e37e385a..5aaa946d 100644 --- a/src/database/kernels/padtranspose/padtranspose_3232.hpp +++ b/src/database/kernels/padtranspose/padtranspose_3232.hpp @@ -51,6 +51,10 @@ const DatabaseEntry PadtransposeComplexSingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -151,6 +155,10 @@ const DatabaseEntry PadtransposeComplexSingle = { { Name{"TITAN X (Pascal) "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/padtranspose/padtranspose_64.hpp b/src/database/kernels/padtranspose/padtranspose_64.hpp index 77328adb..6ac155d9 100644 --- a/src/database/kernels/padtranspose/padtranspose_64.hpp +++ b/src/database/kernels/padtranspose/padtranspose_64.hpp @@ -43,6 +43,10 @@ const DatabaseEntry PadtransposeDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -136,6 +140,10 @@ const DatabaseEntry PadtransposeDouble = { { Name{"TITAN X (Pascal) "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/padtranspose/padtranspose_6464.hpp b/src/database/kernels/padtranspose/padtranspose_6464.hpp index 8330bffb..8ad7337f 100644 --- a/src/database/kernels/padtranspose/padtranspose_6464.hpp +++ b/src/database/kernels/padtranspose/padtranspose_6464.hpp @@ -43,6 +43,10 @@ const DatabaseEntry PadtransposeComplexDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -136,6 +140,10 @@ const DatabaseEntry PadtransposeComplexDouble = { { Name{"TITAN X (Pascal) "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/transpose/transpose_16.hpp b/src/database/kernels/transpose/transpose_16.hpp index a83e81cc..be06265c 100644 --- a/src/database/kernels/transpose/transpose_16.hpp +++ b/src/database/kernels/transpose/transpose_16.hpp @@ -18,6 +18,10 @@ const DatabaseEntry TransposeHalf = { { Name{"AMD Radeon RX 480 "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/transpose/transpose_32.hpp b/src/database/kernels/transpose/transpose_32.hpp index 7e9dec8c..5aca3154 100644 --- a/src/database/kernels/transpose/transpose_32.hpp +++ b/src/database/kernels/transpose/transpose_32.hpp @@ -51,6 +51,10 @@ const DatabaseEntry TransposeSingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 16, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -153,6 +157,10 @@ const DatabaseEntry TransposeSingle = { { Name{"TITAN X (Pascal) "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 16, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/transpose/transpose_3232.hpp b/src/database/kernels/transpose/transpose_3232.hpp index f4990954..b7341b88 100644 --- a/src/database/kernels/transpose/transpose_3232.hpp +++ b/src/database/kernels/transpose/transpose_3232.hpp @@ -51,6 +51,10 @@ const DatabaseEntry TransposeComplexSingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -143,6 +147,10 @@ const DatabaseEntry TransposeComplexSingle = { { Name{"TITAN X (Pascal) "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/transpose/transpose_64.hpp b/src/database/kernels/transpose/transpose_64.hpp index 9f205565..3e73f07b 100644 --- a/src/database/kernels/transpose/transpose_64.hpp +++ b/src/database/kernels/transpose/transpose_64.hpp @@ -43,9 +43,13 @@ const DatabaseEntry TransposeDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, - { kDeviceNameDefault , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, @@ -136,6 +140,10 @@ const DatabaseEntry TransposeDouble = { { Name{"TITAN X (Pascal) "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/transpose/transpose_6464.hpp b/src/database/kernels/transpose/transpose_6464.hpp index 2029c4fb..d16d9534 100644 --- a/src/database/kernels/transpose/transpose_6464.hpp +++ b/src/database/kernels/transpose/transpose_6464.hpp @@ -43,6 +43,10 @@ const DatabaseEntry TransposeComplexDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -127,6 +131,10 @@ const DatabaseEntry TransposeComplexDouble = { { Name{"TITAN X (Pascal) "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/trsv_routine/trsv_routine_32.hpp b/src/database/kernels/trsv_routine/trsv_routine_32.hpp index 7d9b9372..e6184116 100644 --- a/src/database/kernels/trsv_routine/trsv_routine_32.hpp +++ b/src/database/kernels/trsv_routine/trsv_routine_32.hpp @@ -42,6 +42,10 @@ const DatabaseEntry TrsvRoutineSingle = { { Name{"GeForce GTX 1070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/trsv_routine/trsv_routine_3232.hpp b/src/database/kernels/trsv_routine/trsv_routine_3232.hpp index 6ec322fa..e479f235 100644 --- a/src/database/kernels/trsv_routine/trsv_routine_3232.hpp +++ b/src/database/kernels/trsv_routine/trsv_routine_3232.hpp @@ -41,6 +41,10 @@ const DatabaseEntry TrsvRoutineComplexSingle = { { Name{"GeForce GTX 1070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/trsv_routine/trsv_routine_64.hpp b/src/database/kernels/trsv_routine/trsv_routine_64.hpp index db65f847..d5c66625 100644 --- a/src/database/kernels/trsv_routine/trsv_routine_64.hpp +++ b/src/database/kernels/trsv_routine/trsv_routine_64.hpp @@ -32,6 +32,10 @@ const DatabaseEntry TrsvRoutineDouble = { { Name{"GeForce GTX 1070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/trsv_routine/trsv_routine_6464.hpp b/src/database/kernels/trsv_routine/trsv_routine_6464.hpp index 4a98b0a7..db52f414 100644 --- a/src/database/kernels/trsv_routine/trsv_routine_6464.hpp +++ b/src/database/kernels/trsv_routine/trsv_routine_6464.hpp @@ -32,6 +32,10 @@ const DatabaseEntry TrsvRoutineComplexDouble = { { Name{"GeForce GTX 1070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xaxpy/xaxpy_16.hpp b/src/database/kernels/xaxpy/xaxpy_16.hpp index ac243f05..a7548c9f 100644 --- a/src/database/kernels/xaxpy/xaxpy_16.hpp +++ b/src/database/kernels/xaxpy/xaxpy_16.hpp @@ -18,6 +18,10 @@ const DatabaseEntry XaxpyHalf = { { Name{"AMD Radeon RX 480 "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xaxpy/xaxpy_32.hpp b/src/database/kernels/xaxpy/xaxpy_32.hpp index 92ea904d..c2eba834 100644 --- a/src/database/kernels/xaxpy/xaxpy_32.hpp +++ b/src/database/kernels/xaxpy/xaxpy_32.hpp @@ -51,6 +51,10 @@ const DatabaseEntry XaxpySingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -154,8 +158,12 @@ const DatabaseEntry XaxpySingle = { { Name{"TITAN X (Pascal) "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { - { kDeviceNameDefault , Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/xaxpy/xaxpy_3232.hpp b/src/database/kernels/xaxpy/xaxpy_3232.hpp index a1c69b83..8f686b3e 100644 --- a/src/database/kernels/xaxpy/xaxpy_3232.hpp +++ b/src/database/kernels/xaxpy/xaxpy_3232.hpp @@ -51,9 +51,13 @@ const DatabaseEntry XaxpyComplexSingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, - { kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, @@ -151,6 +155,10 @@ const DatabaseEntry XaxpyComplexSingle = { { Name{"TITAN X (Pascal) "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xaxpy/xaxpy_64.hpp b/src/database/kernels/xaxpy/xaxpy_64.hpp index 5cc36a65..4494cf05 100644 --- a/src/database/kernels/xaxpy/xaxpy_64.hpp +++ b/src/database/kernels/xaxpy/xaxpy_64.hpp @@ -43,6 +43,10 @@ const DatabaseEntry XaxpyDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -136,6 +140,10 @@ const DatabaseEntry XaxpyDouble = { { Name{"TITAN X (Pascal) "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xaxpy/xaxpy_6464.hpp b/src/database/kernels/xaxpy/xaxpy_6464.hpp index 07bac80b..aae465cf 100644 --- a/src/database/kernels/xaxpy/xaxpy_6464.hpp +++ b/src/database/kernels/xaxpy/xaxpy_6464.hpp @@ -43,6 +43,10 @@ const DatabaseEntry XaxpyComplexDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -136,6 +140,10 @@ const DatabaseEntry XaxpyComplexDouble = { { Name{"TITAN X (Pascal) "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xdot/xdot_16.hpp b/src/database/kernels/xdot/xdot_16.hpp index 4d1b96d3..f78c1606 100644 --- a/src/database/kernels/xdot/xdot_16.hpp +++ b/src/database/kernels/xdot/xdot_16.hpp @@ -18,6 +18,10 @@ const DatabaseEntry XdotHalf = { { Name{"AMD Radeon RX 480 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xdot/xdot_32.hpp b/src/database/kernels/xdot/xdot_32.hpp index 560e51ca..47c57f12 100644 --- a/src/database/kernels/xdot/xdot_32.hpp +++ b/src/database/kernels/xdot/xdot_32.hpp @@ -47,6 +47,10 @@ const DatabaseEntry XdotSingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -137,6 +141,10 @@ const DatabaseEntry XdotSingle = { { Name{"TITAN X (Pascal) "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xdot/xdot_3232.hpp b/src/database/kernels/xdot/xdot_3232.hpp index 63d5f75e..da9275bf 100644 --- a/src/database/kernels/xdot/xdot_3232.hpp +++ b/src/database/kernels/xdot/xdot_3232.hpp @@ -47,6 +47,10 @@ const DatabaseEntry XdotComplexSingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -134,6 +138,10 @@ const DatabaseEntry XdotComplexSingle = { { Name{"TITAN X (Pascal) "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xdot/xdot_64.hpp b/src/database/kernels/xdot/xdot_64.hpp index 1a1033ee..240c99fc 100644 --- a/src/database/kernels/xdot/xdot_64.hpp +++ b/src/database/kernels/xdot/xdot_64.hpp @@ -39,6 +39,10 @@ const DatabaseEntry XdotDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -120,6 +124,10 @@ const DatabaseEntry XdotDouble = { { Name{"TITAN X (Pascal) "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xdot/xdot_6464.hpp b/src/database/kernels/xdot/xdot_6464.hpp index 4804a164..a32844ef 100644 --- a/src/database/kernels/xdot/xdot_6464.hpp +++ b/src/database/kernels/xdot/xdot_6464.hpp @@ -39,6 +39,10 @@ const DatabaseEntry XdotComplexDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -120,6 +124,10 @@ const DatabaseEntry XdotComplexDouble = { { Name{"TITAN X (Pascal) "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xgemm/xgemm_32.hpp b/src/database/kernels/xgemm/xgemm_32.hpp index de4a044c..4805cdb4 100644 --- a/src/database/kernels/xgemm/xgemm_32.hpp +++ b/src/database/kernels/xgemm/xgemm_32.hpp @@ -51,6 +51,10 @@ const DatabaseEntry XgemmSingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 0, 1, 32, 2, 8, 16, 128, 8, 8, 128, 0, 0, 1, 1, 8, 8 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 16, 128, 8, 8, 128, 0, 0, 1, 1, 8, 8 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 2, 4 } }, + { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 2, 4 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 4, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, @@ -154,6 +158,10 @@ const DatabaseEntry XgemmSingle = { { Name{"TITAN X (Pascal) "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 0, 1, 32, 2, 8, 32, 128, 16, 8, 128, 1, 1, 1, 1, 2, 2 } }, + { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 32, 128, 16, 8, 128, 1, 1, 1, 1, 2, 2 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 32, 32, 64, 1, 1, 0, 0, 4, 2 } }, } }, diff --git a/src/database/kernels/xgemm/xgemm_3232.hpp b/src/database/kernels/xgemm/xgemm_3232.hpp index d55aae56..29ab455c 100644 --- a/src/database/kernels/xgemm/xgemm_3232.hpp +++ b/src/database/kernels/xgemm/xgemm_3232.hpp @@ -151,6 +151,10 @@ const DatabaseEntry XgemmComplexSingle = { { Name{"TITAN X (Pascal) "}, Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 32, 1, 1, 0, 0, 2, 4 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 2, 4 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 64, 1, 0, 0, 1, 2, 2 } }, + { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 64, 1, 0, 0, 1, 2, 2 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 32, 32, 64, 1, 1, 0, 0, 2, 1 } }, } }, diff --git a/src/database/kernels/xgemm/xgemm_64.hpp b/src/database/kernels/xgemm/xgemm_64.hpp index 2bc811d5..2549ea38 100644 --- a/src/database/kernels/xgemm/xgemm_64.hpp +++ b/src/database/kernels/xgemm/xgemm_64.hpp @@ -136,6 +136,10 @@ const DatabaseEntry XgemmDouble = { { Name{"TITAN X (Pascal) "}, Params{ 0, 1, 32, 2, 32, 32, 32, 16, 16, 32, 0, 0, 0, 0, 1, 2 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 64, 0, 0, 0, 0, 2, 4 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1, 2, 1, 1, 16, 16, 32, 8, 8, 16, 0, 0, 0, 0, 2, 1 } }, + { kDeviceNameDefault , Params{ 1, 2, 1, 1, 16, 16, 32, 8, 8, 16, 0, 0, 0, 0, 2, 1 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 32, 1, 1, 0, 0, 2, 2 } }, } }, diff --git a/src/database/kernels/xgemm/xgemm_6464.hpp b/src/database/kernels/xgemm/xgemm_6464.hpp index a3314f93..5f88e319 100644 --- a/src/database/kernels/xgemm/xgemm_6464.hpp +++ b/src/database/kernels/xgemm/xgemm_6464.hpp @@ -135,6 +135,10 @@ const DatabaseEntry XgemmComplexDouble = { { Name{"TITAN X (Pascal) "}, Params{ 0, 1, 32, 2, 16, 16, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, { kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 32, 32, 32, 64, 0, 0, 0, 0, 1, 2 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, + { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 16, 1, 1, 0, 0, 1, 2 } }, } }, diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp index 0ca997fa..9168ce4d 100644 --- a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp +++ b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp @@ -113,6 +113,10 @@ const DatabaseEntry XgemmDirectSingle = { { Name{"TITAN X (Pascal) "}, Params{ 8, 32, 8, 8, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 16, 8, 16, 32, 8, 1, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 16, 8, 16, 32, 8, 1, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp index fbd04af8..9b2ed40a 100644 --- a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp +++ b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp @@ -106,6 +106,10 @@ const DatabaseEntry XgemmDirectComplexSingle = { { Name{"TITAN X (Pascal) "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp index d589f916..62206ec0 100644 --- a/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp +++ b/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp @@ -99,6 +99,10 @@ const DatabaseEntry XgemmDirectDouble = { { Name{"TITAN X (Pascal) "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp index e183985a..da0710be 100644 --- a/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp +++ b/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp @@ -99,6 +99,10 @@ const DatabaseEntry XgemmDirectComplexDouble = { { Name{"TITAN X (Pascal) "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xgemv/xgemv_32.hpp b/src/database/kernels/xgemv/xgemv_32.hpp index fca974ec..7f788521 100644 --- a/src/database/kernels/xgemv/xgemv_32.hpp +++ b/src/database/kernels/xgemv/xgemv_32.hpp @@ -152,6 +152,10 @@ const DatabaseEntry XgemvSingle = { { Name{"TITAN X (Pascal) "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xgemv/xgemv_3232.hpp b/src/database/kernels/xgemv/xgemv_3232.hpp index a09524de..2025867f 100644 --- a/src/database/kernels/xgemv/xgemv_3232.hpp +++ b/src/database/kernels/xgemv/xgemv_3232.hpp @@ -145,6 +145,10 @@ const DatabaseEntry XgemvComplexSingle = { { Name{"TITAN X (Pascal) "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xgemv/xgemv_64.hpp b/src/database/kernels/xgemv/xgemv_64.hpp index b5a86a5f..71a6878f 100644 --- a/src/database/kernels/xgemv/xgemv_64.hpp +++ b/src/database/kernels/xgemv/xgemv_64.hpp @@ -127,8 +127,12 @@ const DatabaseEntry XgemvDouble = { { Name{"TITAN X (Pascal) "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { - { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/xgemv/xgemv_6464.hpp b/src/database/kernels/xgemv/xgemv_6464.hpp index d05efd2e..b380e440 100644 --- a/src/database/kernels/xgemv/xgemv_6464.hpp +++ b/src/database/kernels/xgemv/xgemv_6464.hpp @@ -108,8 +108,12 @@ const DatabaseEntry XgemvComplexDouble = { { Name{"GeForce GTX 1080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { - { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp index 6271eab7..45cc0bed 100644 --- a/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp +++ b/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp @@ -152,6 +152,10 @@ const DatabaseEntry XgemvFastSingle = { { Name{"TITAN X (Pascal) "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp index bfe475cb..a440f463 100644 --- a/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp +++ b/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp @@ -128,6 +128,10 @@ const DatabaseEntry XgemvFastComplexSingle = { { Name{"GeForce GTX 1080 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp index 0383bca3..c5470eaf 100644 --- a/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp +++ b/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp @@ -127,8 +127,12 @@ const DatabaseEntry XgemvFastDouble = { { Name{"TITAN X (Pascal) "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { - { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp index 005eccb3..3b0a21aa 100644 --- a/src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp +++ b/src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp @@ -104,6 +104,10 @@ const DatabaseEntry XgemvFastComplexDouble = { { Name{"GeForce GTX 1080 Ti "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp index 212e9c2f..7b7b7a27 100644 --- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp +++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp @@ -115,6 +115,10 @@ const DatabaseEntry XgemvFastRotSingle = { { Name{"TITAN X (Pascal) "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp index 73bf5472..c495b2a8 100644 --- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp +++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp @@ -96,6 +96,10 @@ const DatabaseEntry XgemvFastRotComplexSingle = { { Name{"GeForce GTX 1080 Ti "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp index d372ad90..6a10f4b6 100644 --- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp +++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp @@ -92,6 +92,10 @@ const DatabaseEntry XgemvFastRotDouble = { { Name{"TITAN X (Pascal) "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp index 3ad4170e..a7e3acc3 100644 --- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp +++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp @@ -76,8 +76,12 @@ const DatabaseEntry XgemvFastRotComplexDouble = { { Name{"GeForce GTX 1080 Ti "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { - { kDeviceNameDefault , Params{ 4, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/xger/xger_16.hpp b/src/database/kernels/xger/xger_16.hpp index 8c86df38..8b08a401 100644 --- a/src/database/kernels/xger/xger_16.hpp +++ b/src/database/kernels/xger/xger_16.hpp @@ -18,6 +18,10 @@ const DatabaseEntry XgerHalf = { { Name{"AMD Radeon RX 480 "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, @@ -52,7 +56,7 @@ const DatabaseEntry XgerHalf = { { // Default kDeviceTypeAll, "default", { { "default", { - { kDeviceNameDefault , Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/xger/xger_32.hpp b/src/database/kernels/xger/xger_32.hpp index 7455fdba..0b99d911 100644 --- a/src/database/kernels/xger/xger_32.hpp +++ b/src/database/kernels/xger/xger_32.hpp @@ -51,6 +51,10 @@ const DatabaseEntry XgerSingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 16, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -140,8 +144,12 @@ const DatabaseEntry XgerSingle = { { Name{"TITAN X (Pascal) "}, Params{ 512, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 32, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 32, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { - { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, } }, diff --git a/src/database/kernels/xger/xger_3232.hpp b/src/database/kernels/xger/xger_3232.hpp index af0431a7..7f547e1e 100644 --- a/src/database/kernels/xger/xger_3232.hpp +++ b/src/database/kernels/xger/xger_3232.hpp @@ -51,6 +51,10 @@ const DatabaseEntry XgerComplexSingle = { { Name{"ATI Radeon HD 6750M "}, Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -137,6 +141,10 @@ const DatabaseEntry XgerComplexSingle = { { Name{"TITAN X (Pascal) "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xger/xger_64.hpp b/src/database/kernels/xger/xger_64.hpp index f531dd5a..857efaa3 100644 --- a/src/database/kernels/xger/xger_64.hpp +++ b/src/database/kernels/xger/xger_64.hpp @@ -43,6 +43,10 @@ const DatabaseEntry XgerDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -123,6 +127,10 @@ const DatabaseEntry XgerDouble = { { Name{"TITAN X (Pascal) "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/database/kernels/xger/xger_6464.hpp b/src/database/kernels/xger/xger_6464.hpp index b58867f1..bfeb114e 100644 --- a/src/database/kernels/xger/xger_6464.hpp +++ b/src/database/kernels/xger/xger_6464.hpp @@ -43,6 +43,10 @@ const DatabaseEntry XgerComplexDouble = { { Name{"AMD Radeon R9 380 "}, Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "Vega", { + { Name{"Radeon RX Vega "}, Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, @@ -123,6 +127,10 @@ const DatabaseEntry XgerComplexDouble = { { Name{"TITAN X (Pascal) "}, Params{ 4, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, + { "SM7.5", { + { Name{"TITAN RTX "}, Params{ 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, { "default", { { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index e65acf17..85cbdc86 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -64,7 +64,7 @@ void Xamax(const int n, #else x = fabs(x); #endif - if (x >= max) { + if (x > max) { max = x; imax = id*x_inc + x_offset; } @@ -77,7 +77,7 @@ void Xamax(const int n, // Performs reduction in local memory for (int s=WGS1/2; s>0; s=s>>1) { if (lid < s) { - if (maxlm[lid + s] >= maxlm[lid]) { + if (maxlm[lid + s] > maxlm[lid]) { maxlm[lid] = maxlm[lid + s]; imaxlm[lid] = imaxlm[lid + s]; } @@ -105,7 +105,7 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm, const int lid = get_local_id(0); // Performs the first step of the reduction while loading the data - if (maxgm[lid + WGS2] >= maxgm[lid]) { + if (maxgm[lid + WGS2] > maxgm[lid]) { maxlm[lid] = maxgm[lid + WGS2]; imaxlm[lid] = imaxgm[lid + WGS2]; } @@ -118,7 +118,7 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm, // Performs reduction in local memory for (int s=WGS2/2; s>0; s=s>>1) { if (lid < s) { - if (maxlm[lid + s] >= maxlm[lid]) { + if (maxlm[lid + s] > maxlm[lid]) { maxlm[lid] = maxlm[lid + s]; imaxlm[lid] = imaxlm[lid + s]; } diff --git a/src/pyclblast/samples/saxpybatched.py b/src/pyclblast/samples/saxpybatched.py new file mode 100644 index 00000000..fa523945 --- /dev/null +++ b/src/pyclblast/samples/saxpybatched.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. +# This file follows the PEP8 Python style guide and uses a max-width of 100 characters per line. +# +# Author(s): +# Cedric Nugteren <www.cedricnugteren.nl> + +import numpy as np +import pyopencl as cl +from pyopencl.array import Array +import pyclblast + +# Settings for this sample: +batch_count = 2 +dtype = 'float32' +alphas = [1.5, 1.0] +n = 4 + +print("# Setting up OpenCL") +ctx = cl.create_some_context() +queue = cl.CommandQueue(ctx) + +print("# Setting up Numpy arrays") +x = np.random.rand(n * batch_count).astype(dtype=dtype) +y = np.random.rand(n * batch_count).astype(dtype=dtype) + +print("# Batch offsets: next after each other") +x_offsets = [0, n] +y_offsets = [0, n] + +print("# Setting up OpenCL arrays") +clx = Array(queue, x.shape, x.dtype) +cly = Array(queue, y.shape, y.dtype) +clx.set(x) +cly.set(y) + +print("# Example level-1 batched operation: AXPY-batched") +assert len(alphas) == len(x_offsets) == len(y_offsets) == batch_count +pyclblast.axpyBatched(queue, n, clx, cly, alphas, x_offsets, y_offsets) +queue.finish() + +print("# Full result for vector y: %s" % str(cly.get())) +for i in range(batch_count): + result = alphas[i] * x[x_offsets[i]:x_offsets[i] + n] + y[y_offsets[i]:y_offsets[i] + n] + print("# Expected result batch #%d: %s" % (i, str(result))) diff --git a/src/pyclblast/setup.py b/src/pyclblast/setup.py index 1c1bf3ab..bcc966ed 100644 --- a/src/pyclblast/setup.py +++ b/src/pyclblast/setup.py @@ -22,7 +22,7 @@ ext_modules.append( setup( name="pyclblast", - version="1.2.0", + version="1.3.0", author="Cedric Nugteren", author_email="web@cedricnugteren.nl", url="https://github.com/CNugteren/CLBlast/blob/master/src/pyclblast", diff --git a/src/pyclblast/src/pyclblast.pyx b/src/pyclblast/src/pyclblast.pyx index 14efcf8a..eb46649f 100644 --- a/src/pyclblast/src/pyclblast.pyx +++ b/src/pyclblast/src/pyclblast.pyx @@ -364,6 +364,7 @@ def swap(queue, n, x, y, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0): err = CLBlastHswap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXswap' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -405,6 +406,7 @@ def scal(queue, n, x, x_inc = 1, alpha = 1.0, x_offset = 0): err = CLBlastHscal(n, <cl_half>alpha, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXscal' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -448,6 +450,7 @@ def copy(queue, n, x, y, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0): err = CLBlastHcopy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXcopy' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -491,6 +494,7 @@ def axpy(queue, n, x, y, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset = 0, y_offs err = CLBlastHaxpy(n, <cl_half>alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXaxpy' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -530,6 +534,7 @@ def dot(queue, n, x, y, dot, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0, d err = CLBlastHdot(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXdot' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -566,6 +571,7 @@ def dotu(queue, n, x, y, dot, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0, err = CLBlastZdotu(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXdotu' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -602,6 +608,7 @@ def dotc(queue, n, x, y, dot, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0, err = CLBlastZdotc(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXdotc' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -645,6 +652,7 @@ def nrm2(queue, n, x, nrm2, x_inc = 1, x_offset = 0, nrm2_offset = 0): err = CLBlastHnrm2(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXnrm2' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -688,6 +696,7 @@ def asum(queue, n, x, asum, x_inc = 1, x_offset = 0, asum_offset = 0): err = CLBlastHasum(n, asum_buffer, asum_offset, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXasum' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -731,6 +740,7 @@ def sum(queue, n, x, sum, x_inc = 1, x_offset = 0, sum_offset = 0): err = CLBlastHsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXsum' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -774,6 +784,7 @@ def amax(queue, n, x, imax, x_inc = 1, x_offset = 0, imax_offset = 0): err = CLBlastiHamax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXamax' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -817,6 +828,7 @@ def amin(queue, n, x, imin, x_inc = 1, x_offset = 0, imin_offset = 0): err = CLBlastiHamin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXamin' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -860,6 +872,7 @@ def max(queue, n, x, imax, x_inc = 1, x_offset = 0, imax_offset = 0): err = CLBlastiHmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXmax' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -903,6 +916,7 @@ def min(queue, n, x, imin, x_inc = 1, x_offset = 0, imin_offset = 0): err = CLBlastiHmin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXmin' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -949,6 +963,7 @@ def gemv(queue, m, n, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0 err = CLBlastHgemv(CLBlastLayoutRowMajor, a_transpose, m, n, <cl_half>alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXgemv' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -995,6 +1010,7 @@ def gbmv(queue, m, n, kl, ku, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, err = CLBlastHgbmv(CLBlastLayoutRowMajor, a_transpose, m, n, kl, ku, <cl_half>alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXgbmv' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1032,6 +1048,7 @@ def hemv(queue, n, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0, err = CLBlastZhemv(CLBlastLayoutRowMajor, triangle, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_double2>cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXhemv' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1069,6 +1086,7 @@ def hbmv(queue, n, k, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0 err = CLBlastZhbmv(CLBlastLayoutRowMajor, triangle, n, k, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_double2>cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXhbmv' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1106,6 +1124,7 @@ def hpmv(queue, n, ap, x, y, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0. err = CLBlastZhpmv(CLBlastLayoutRowMajor, triangle, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), ap_buffer, ap_offset, x_buffer, x_offset, x_inc, <cl_double2>cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXhpmv' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1146,6 +1165,7 @@ def symv(queue, n, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0, err = CLBlastHsymv(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXsymv' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1186,6 +1206,7 @@ def sbmv(queue, n, k, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0 err = CLBlastHsbmv(CLBlastLayoutRowMajor, triangle, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXsbmv' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1226,6 +1247,7 @@ def spmv(queue, n, ap, x, y, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0. err = CLBlastHspmv(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXspmv' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1272,6 +1294,7 @@ def trmv(queue, n, a, x, a_ld, x_inc = 1, lower_triangle = False, a_transp = Fal err = CLBlastHtrmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXtrmv' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1318,6 +1341,7 @@ def tbmv(queue, n, k, a, x, a_ld, x_inc = 1, lower_triangle = False, a_transp = err = CLBlastHtbmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXtbmv' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1364,6 +1388,7 @@ def tpmv(queue, n, ap, x, ap_ld, x_inc = 1, lower_triangle = False, a_transp = F err = CLBlastHtpmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXtpmv' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1407,6 +1432,7 @@ def trsv(queue, n, a, x, a_ld, x_inc = 1, lower_triangle = False, a_transp = Fal err = CLBlastZtrsv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXtrsv' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1446,6 +1472,7 @@ def ger(queue, m, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset err = CLBlastHger(CLBlastLayoutRowMajor, m, n, <cl_half>alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXger' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1482,6 +1509,7 @@ def geru(queue, m, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset err = CLBlastZgeru(CLBlastLayoutRowMajor, m, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXgeru' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1518,6 +1546,7 @@ def gerc(queue, m, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset err = CLBlastZgerc(CLBlastLayoutRowMajor, m, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXgerc' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1553,6 +1582,7 @@ def her(queue, n, x, a, a_ld, x_inc = 1, alpha = 1.0, lower_triangle = False, x_ err = CLBlastZher(CLBlastLayoutRowMajor, triangle, n, <cl_double>alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXher' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1588,6 +1618,7 @@ def hpr(queue, n, x, ap, ap_ld, x_inc = 1, alpha = 1.0, lower_triangle = False, err = CLBlastZhpr(CLBlastLayoutRowMajor, triangle, n, <cl_double>alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXhpr' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1625,6 +1656,7 @@ def her2(queue, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_trian err = CLBlastZher2(CLBlastLayoutRowMajor, triangle, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXher2' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1662,6 +1694,7 @@ def hpr2(queue, n, x, y, ap, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_tri err = CLBlastZhpr2(CLBlastLayoutRowMajor, triangle, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXhpr2' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1700,6 +1733,7 @@ def syr(queue, n, x, a, a_ld, x_inc = 1, alpha = 1.0, lower_triangle = False, x_ err = CLBlastHsyr(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXsyr' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1738,6 +1772,7 @@ def spr(queue, n, x, ap, ap_ld, x_inc = 1, alpha = 1.0, lower_triangle = False, err = CLBlastHspr(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXspr' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1778,6 +1813,7 @@ def syr2(queue, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_trian err = CLBlastHsyr2(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXsyr2' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1818,6 +1854,7 @@ def spr2(queue, n, x, y, ap, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_tri err = CLBlastHspr2(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXspr2' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1865,6 +1902,7 @@ def gemm(queue, m, n, k, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, a_t err = CLBlastHgemm(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_half>beta, c_buffer, c_offset, c_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXgemm' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1912,6 +1950,7 @@ def symm(queue, m, n, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, right_ err = CLBlastHsymm(CLBlastLayoutRowMajor, side, triangle, m, n, <cl_half>alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_half>beta, c_buffer, c_offset, c_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXsymm' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1950,6 +1989,7 @@ def hemm(queue, m, n, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, right_ err = CLBlastZhemm(CLBlastLayoutRowMajor, side, triangle, m, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_double2>cl_double2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXhemm' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -1995,6 +2035,7 @@ def syrk(queue, n, k, a, c, a_ld, c_ld, alpha = 1.0, beta = 0.0, lower_triangle err = CLBlastHsyrk(CLBlastLayoutRowMajor, triangle, a_transpose, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, <cl_half>beta, c_buffer, c_offset, c_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXsyrk' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -2031,6 +2072,7 @@ def herk(queue, n, k, a, c, a_ld, c_ld, alpha = 1.0, beta = 0.0, lower_triangle err = CLBlastZherk(CLBlastLayoutRowMajor, triangle, a_transpose, n, k, <cl_double>alpha, a_buffer, a_offset, a_ld, <cl_double>beta, c_buffer, c_offset, c_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXherk' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -2078,6 +2120,7 @@ def syr2k(queue, n, k, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, lower err = CLBlastHsyr2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_half>beta, c_buffer, c_offset, c_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXsyr2k' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -2116,6 +2159,7 @@ def her2k(queue, n, k, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, lower err = CLBlastZher2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_double>beta, c_buffer, c_offset, c_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXher2k' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -2163,6 +2207,7 @@ def trmm(queue, m, n, a, b, a_ld, b_ld, alpha = 1.0, right_side = False, lower_t err = CLBlastHtrmm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, <cl_half>alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXtrmm' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) @@ -2207,11 +2252,223 @@ def trsm(queue, m, n, a, b, a_ld, b_ld, alpha = 1.0, right_side = False, lower_t err = CLBlastZtrsm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event) else: raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + if err != CLBlastSuccess: raise RuntimeError("PyCLBlast: 'CLBlastXtrsm' failed: %s" % get_status_message(err)) return cl.Event.from_int_ptr(<size_t>event) #################################################################################################### +# Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +#################################################################################################### + +cdef extern from "clblast_c.h": + CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, const float *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event) + CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, const double *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event) + CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, const cl_float2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event) + CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, const cl_double2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event) + CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, const cl_half *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event) + +def axpyBatched(queue, n, x, y, alphas, x_offsets, y_offsets, x_inc = 1, y_inc = 1): + """ + xAXPYBATCHED: Batched version of AXPY + """ + + dtype = check_dtype([x, y], ["float32", "float64", "complex64", "complex128", "float16"]) + check_vector(x, "x") + check_vector(y, "y") + + if len(x_offsets) != len(y_offsets) != len(alphas): + raise RuntimeError("PyCLBlast: 'CLBlastXaxpyBatched' failed: length of batch-sized arguments x_offsets, y_offsets, alphas should be equal") + batch_count = len(x_offsets) + + cdef size_t *x_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t)) + for i in range(batch_count): + x_offsets_c[i] = x_offsets[i] + cdef size_t *y_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t)) + for i in range(batch_count): + y_offsets_c[i] = y_offsets[i] + cdef void *alphas_c = <void *> PyMem_Malloc(batch_count * sizeof(dtype_size[dtype])) + for i in range(batch_count): + if dtype == np.dtype("float32"): + (<cl_float*>alphas_c)[i] = <cl_float>alphas[i] + elif dtype == np.dtype("float64"): + (<cl_double*>alphas_c)[i] = <cl_double>alphas[i] + elif dtype == np.dtype("complex64"): + (<cl_float2*>alphas_c)[i] = <cl_float2>cl_float2(x=alphas[i].real,y=alphas[i].imag) + elif dtype == np.dtype("complex128"): + (<cl_double2*>alphas_c)[i] = <cl_double2>cl_double2(x=alphas[i].real,y=alphas[i].imag) + elif dtype == np.dtype("float16"): + (<cl_half*>alphas_c)[i] = <cl_half>alphas[i] + + cdef cl_mem x_buffer = <cl_mem><size_t>x.base_data.int_ptr + cdef cl_mem y_buffer = <cl_mem><size_t>y.base_data.int_ptr + + cdef cl_command_queue command_queue = <cl_command_queue><size_t>queue.int_ptr + cdef cl_event event = NULL + + cdef CLBlastStatusCode err + if dtype == np.dtype("float32"): + err = CLBlastSaxpyBatched(n, <cl_float*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event) + elif dtype == np.dtype("float64"): + err = CLBlastDaxpyBatched(n, <cl_double*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event) + elif dtype == np.dtype("complex64"): + err = CLBlastCaxpyBatched(n, <cl_float2*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event) + elif dtype == np.dtype("complex128"): + err = CLBlastZaxpyBatched(n, <cl_double2*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event) + elif dtype == np.dtype("float16"): + err = CLBlastHaxpyBatched(n, <cl_half*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event) + else: + raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + + PyMem_Free(x_offsets_c) + PyMem_Free(y_offsets_c) + PyMem_Free(alphas_c) + + if err != CLBlastSuccess: + raise RuntimeError("PyCLBlast: 'CLBlastXaxpyBatched' failed: %s" % get_status_message(err)) + return cl.Event.from_int_ptr(<size_t>event) + +#################################################################################################### +# Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED +#################################################################################################### + +cdef extern from "clblast_c.h": + CLBlastStatusCode CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const float *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event) + CLBlastStatusCode CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const double *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event) + CLBlastStatusCode CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_float2 *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event) + CLBlastStatusCode CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_double2 *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event) + CLBlastStatusCode CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_half *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event) + +def gemmBatched(queue, m, n, k, a, b, c, alphas, betas, a_ld, b_ld, c_ld, a_offsets, b_offsets, c_offsets, a_transp = False, b_transp = False): + """ + xGEMMBATCHED: Batched version of GEMM + """ + + dtype = check_dtype([a, b, c], ["float32", "float64", "complex64", "complex128", "float16"]) + check_matrix(a, "a") + check_matrix(b, "b") + check_matrix(c, "c") + + if len(a_offsets) != len(b_offsets) != len(c_offsets) != len(alphas) != len(betas): + raise RuntimeError("PyCLBlast: 'CLBlastXgemmBatched' failed: length of batch-sized arguments a_offsets, b_offsets, c_offsets, alphas, betas should be equal") + batch_count = len(a_offsets) + + cdef size_t *a_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t)) + for i in range(batch_count): + a_offsets_c[i] = a_offsets[i] + cdef size_t *b_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t)) + for i in range(batch_count): + b_offsets_c[i] = b_offsets[i] + cdef size_t *c_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t)) + for i in range(batch_count): + c_offsets_c[i] = c_offsets[i] + cdef void *alphas_c = <void *> PyMem_Malloc(batch_count * sizeof(dtype_size[dtype])) + for i in range(batch_count): + if dtype == np.dtype("float32"): + (<cl_float*>alphas_c)[i] = <cl_float>alphas[i] + elif dtype == np.dtype("float64"): + (<cl_double*>alphas_c)[i] = <cl_double>alphas[i] + elif dtype == np.dtype("complex64"): + (<cl_float2*>alphas_c)[i] = <cl_float2>cl_float2(x=alphas[i].real,y=alphas[i].imag) + elif dtype == np.dtype("complex128"): + (<cl_double2*>alphas_c)[i] = <cl_double2>cl_double2(x=alphas[i].real,y=alphas[i].imag) + elif dtype == np.dtype("float16"): + (<cl_half*>alphas_c)[i] = <cl_half>alphas[i] + cdef void *betas_c = <void *> PyMem_Malloc(batch_count * sizeof(dtype_size[dtype])) + for i in range(batch_count): + if dtype == np.dtype("float32"): + (<cl_float*>betas_c)[i] = <cl_float>betas[i] + elif dtype == np.dtype("float64"): + (<cl_double*>betas_c)[i] = <cl_double>betas[i] + elif dtype == np.dtype("complex64"): + (<cl_float2*>betas_c)[i] = <cl_float2>cl_float2(x=betas[i].real,y=betas[i].imag) + elif dtype == np.dtype("complex128"): + (<cl_double2*>betas_c)[i] = <cl_double2>cl_double2(x=betas[i].real,y=betas[i].imag) + elif dtype == np.dtype("float16"): + (<cl_half*>betas_c)[i] = <cl_half>betas[i] + + cdef cl_mem a_buffer = <cl_mem><size_t>a.base_data.int_ptr + cdef cl_mem b_buffer = <cl_mem><size_t>b.base_data.int_ptr + cdef cl_mem c_buffer = <cl_mem><size_t>c.base_data.int_ptr + + cdef cl_command_queue command_queue = <cl_command_queue><size_t>queue.int_ptr + cdef cl_event event = NULL + a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo + b_transpose = CLBlastTransposeYes if b_transp else CLBlastTransposeNo + + cdef CLBlastStatusCode err + if dtype == np.dtype("float32"): + err = CLBlastSgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_float*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_float*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event) + elif dtype == np.dtype("float64"): + err = CLBlastDgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_double*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_double*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event) + elif dtype == np.dtype("complex64"): + err = CLBlastCgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_float2*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_float2*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event) + elif dtype == np.dtype("complex128"): + err = CLBlastZgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_double2*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_double2*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event) + elif dtype == np.dtype("float16"): + err = CLBlastHgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_half*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_half*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event) + else: + raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + + PyMem_Free(a_offsets_c) + PyMem_Free(b_offsets_c) + PyMem_Free(c_offsets_c) + PyMem_Free(alphas_c) + PyMem_Free(betas_c) + + if err != CLBlastSuccess: + raise RuntimeError("PyCLBlast: 'CLBlastXgemmBatched' failed: %s" % get_status_message(err)) + return cl.Event.from_int_ptr(<size_t>event) + +#################################################################################################### +# StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED +#################################################################################################### + +cdef extern from "clblast_c.h": + CLBlastStatusCode CLBlastSgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event) + CLBlastStatusCode CLBlastDgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event) + CLBlastStatusCode CLBlastCgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event) + CLBlastStatusCode CLBlastZgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event) + CLBlastStatusCode CLBlastHgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event) + +def gemmStridedBatched(queue, m, n, k, batch_count, a, b, c, a_ld, b_ld, c_ld, a_stride, b_stride, c_stride, alpha = 1.0, beta = 0.0, a_transp = False, b_transp = False, a_offset = 0, b_offset = 0, c_offset = 0): + """ + xGEMMSTRIDEDBATCHED: StridedBatched version of GEMM + """ + + dtype = check_dtype([a, b, c], ["float32", "float64", "complex64", "complex128", "float16"]) + check_matrix(a, "a") + check_matrix(b, "b") + check_matrix(c, "c") + + cdef cl_mem a_buffer = <cl_mem><size_t>a.base_data.int_ptr + cdef cl_mem b_buffer = <cl_mem><size_t>b.base_data.int_ptr + cdef cl_mem c_buffer = <cl_mem><size_t>c.base_data.int_ptr + + cdef cl_command_queue command_queue = <cl_command_queue><size_t>queue.int_ptr + cdef cl_event event = NULL + a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo + b_transpose = CLBlastTransposeYes if b_transp else CLBlastTransposeNo + + cdef CLBlastStatusCode err + if dtype == np.dtype("float32"): + err = CLBlastSgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_float>alpha, a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, <cl_float>beta, c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event) + elif dtype == np.dtype("float64"): + err = CLBlastDgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_double>alpha, a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, <cl_double>beta, c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event) + elif dtype == np.dtype("complex64"): + err = CLBlastCgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_float2>cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, <cl_float2>cl_float2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event) + elif dtype == np.dtype("complex128"): + err = CLBlastZgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, <cl_double2>cl_double2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event) + elif dtype == np.dtype("float16"): + err = CLBlastHgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, <cl_half>beta, c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event) + else: + raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype) + + if err != CLBlastSuccess: + raise RuntimeError("PyCLBlast: 'CLBlastXgemmStridedBatched' failed: %s" % get_status_message(err)) + return cl.Event.from_int_ptr(<size_t>event) + +#################################################################################################### # Overrides the parameters #################################################################################################### diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp index c5eee527..be6b233e 100644 --- a/src/tuning/tuning.cpp +++ b/src/tuning/tuning.cpp @@ -82,7 +82,7 @@ void PrintTimingsToFileAsJSON(const std::string &filename, void print_separator(const size_t parameters_size) { printf("x------x-------x"); for (auto i = size_t{0}; i < parameters_size; ++i) { printf("-----"); } - printf("-x----------------x--------------x--------x-------------------x\n"); + printf("-x-----------------x-----------------x----------------x--------------x--------x-------------------x\n"); } // ================================================================================================= @@ -145,6 +145,7 @@ void Tuner(int argc, char* argv[], const int V, const auto platform = Platform(args.platform_id); const auto device = Device(platform, args.device_id); const auto context = Context(device); + auto queue = Queue(context, device); // Tests for validity of the precision and retrieves properties if (!PrecisionSupported<T>(device)) { @@ -203,12 +204,11 @@ void Tuner(int argc, char* argv[], const int V, printf("\n"); printf("| ID | total |"); for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); } - printf("param | compiles | time | %6s | status |\n", settings.performance_unit.c_str()); + printf("param | local | global | compiles | time | %6s | status |\n", settings.performance_unit.c_str()); print_separator(settings.parameters.size()); // First runs a reference example to compare against try { - auto queue = Queue(context, device); printf("| ref | - |"); for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); } printf(" - |"); @@ -219,6 +219,16 @@ void Tuner(int argc, char* argv[], const int V, device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]); } + // Sets the thread configuration + auto global = settings.global_size_ref; + auto local = settings.local_size_ref; + + // Make sure that the global worksize is a multiple of the local + for (auto i=size_t{0}; i<global.size(); ++i) { + while ((global[i] / local[i]) * local[i] != global[i]) { global[i]++; } + } + printf("%8zu%8zu |%8zu%8zu |", local[0], local[1], global[0], global[1]); + // Compiles the kernel auto compiler_options = std::vector<std::string>(); const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name, @@ -229,7 +239,7 @@ void Tuner(int argc, char* argv[], const int V, // Runs the kernel const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, - settings.global_size_ref, settings.local_size_ref); + global, local); printf(" - |"); if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); } @@ -251,8 +261,6 @@ void Tuner(int argc, char* argv[], const int V, auto results = std::vector<TuningResult>(); for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) { try { - auto queue = Queue(context, device); - auto configuration = configurations[config_id]; printf("| %4zu | %5zu |", config_id + 1, configurations.size()); for (const auto& parameter : settings.parameters) { @@ -266,10 +274,16 @@ void Tuner(int argc, char* argv[], const int V, } // Sets the thread configuration - const auto global = SetThreadConfiguration(configuration, settings.global_size, - settings.mul_global, settings.div_global); - const auto local = SetThreadConfiguration(configuration, settings.local_size, - settings.mul_local, settings.div_local); + auto global = SetThreadConfiguration(configuration, settings.global_size, + settings.mul_global, settings.div_global); + auto local = SetThreadConfiguration(configuration, settings.local_size, + settings.mul_local, settings.div_local); + + // Make sure that the global worksize is a multiple of the local + for (auto i=size_t{0}; i<global.size(); ++i) { + while ((global[i] / local[i]) * local[i] != global[i]) { global[i]++; } + } + printf("%8zu%8zu |%8zu%8zu |", local[0], local[1], global[0], global[1]); // Sets the parameters for this configuration auto kernel_source = std::string{""}; |