summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2017-11-07 22:20:13 +0100
committer GitHub <noreply@github.com> 2017-11-07 22:20:13 +0100
commit b18cc9d3f18accf88c9551c98c51b99add57b96c (patch)
tree a9017ad18e161647b05ba6c597dfe8ae5125298b
parent 061b1c571b86714f1d323563a9ac587a850ecddc (diff)
parent 6fe9916231a0c6316e3427aaed3be281080a2692 (diff)
Merge pull request #212 from CNugteren/kernel_selection_tuner
GEMM kernel selection tuner
-rw-r--r-- CHANGELOG 1
-rw-r--r-- CMakeLists.txt 12
-rw-r--r-- README.md 2
-rw-r--r-- ROADMAP.md 6
-rw-r--r-- scripts/database/database/bests.py 10
-rw-r--r-- scripts/database/database/clblast.py 3
-rw-r--r-- scripts/database/database/defaults.py 5
-rw-r--r-- scripts/database/database/io.py 2
-rw-r--r-- src/database/database.cpp 5
-rw-r--r-- src/database/kernel_selection.hpp 136
-rw-r--r-- src/database/kernels/copy/copy_64.hpp 2
-rw-r--r-- src/database/kernels/copy/copy_6464.hpp 2
-rw-r--r-- src/database/kernels/gemm_routine/gemm_routine.hpp 14
-rw-r--r-- src/database/kernels/gemm_routine/gemm_routine_16.hpp 34
-rw-r--r-- src/database/kernels/gemm_routine/gemm_routine_32.hpp 58
-rw-r--r-- src/database/kernels/gemm_routine/gemm_routine_3232.hpp 58
-rw-r--r-- src/database/kernels/gemm_routine/gemm_routine_64.hpp 50
-rw-r--r-- src/database/kernels/gemm_routine/gemm_routine_6464.hpp 50
-rw-r--r-- src/database/kernels/pad/pad_3232.hpp 2
-rw-r--r-- src/database/kernels/padtranspose/padtranspose_32.hpp 2
-rw-r--r-- src/database/kernels/padtranspose/padtranspose_3232.hpp 2
-rw-r--r-- src/database/kernels/padtranspose/padtranspose_64.hpp 2
-rw-r--r-- src/database/kernels/transpose/transpose_3232.hpp 2
-rw-r--r-- src/database/kernels/transpose/transpose_64.hpp 2
-rw-r--r-- src/database/kernels/transpose/transpose_6464.hpp 4
-rw-r--r-- src/database/kernels/xaxpy/xaxpy_32.hpp 4
-rw-r--r-- src/database/kernels/xaxpy/xaxpy_3232.hpp 2
-rw-r--r-- src/database/kernels/xaxpy/xaxpy_64.hpp 2
-rw-r--r-- src/database/kernels/xdot/xdot_32.hpp 8
-rw-r--r-- src/database/kernels/xdot/xdot_3232.hpp 2
-rw-r--r-- src/database/kernels/xdot/xdot_6464.hpp 4
-rw-r--r-- src/database/kernels/xgemm/xgemm_32.hpp 10
-rw-r--r-- src/database/kernels/xgemm/xgemm_3232.hpp 14
-rw-r--r-- src/database/kernels/xgemm/xgemm_64.hpp 16
-rw-r--r-- src/database/kernels/xgemm/xgemm_6464.hpp 12
-rw-r--r-- src/database/kernels/xgemm_direct/xgemm_direct_32.hpp 1
-rw-r--r-- src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp 5
-rw-r--r-- src/database/kernels/xgemm_direct/xgemm_direct_64.hpp 5
-rw-r--r-- src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp 1
-rw-r--r-- src/database/kernels/xgemv/xgemv_32.hpp 4
-rw-r--r-- src/database/kernels/xgemv/xgemv_3232.hpp 2
-rw-r--r-- src/database/kernels/xgemv/xgemv_64.hpp 2
-rw-r--r-- src/database/kernels/xgemv_fast/xgemv_fast_64.hpp 2
-rw-r--r-- src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp 2
-rw-r--r-- src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp 2
-rw-r--r-- src/database/kernels/xger/xger_16.hpp 2
-rw-r--r-- src/database/kernels/xger/xger_3232.hpp 4
-rw-r--r-- src/database/kernels/xger/xger_64.hpp 2
-rw-r--r-- src/database/kernels/xger/xger_6464.hpp 2
-rw-r--r-- src/routine.cpp 2
-rw-r--r-- src/routines/level3/xgemm.cpp 6
-rw-r--r-- src/routines/levelx/xgemmbatched.cpp 2
-rw-r--r-- src/tuning/routines/xgemm.cpp 166
-rw-r--r-- src/utilities/timing.hpp 123
-rw-r--r-- test/diagnostics.cpp 14
-rw-r--r-- test/routines/level3/xgemm.hpp 2
56 files changed, 659 insertions, 232 deletions
diff --git a/CHANGELOG b/CHANGELOG
index ed27291e..02bee585 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -9,6 +9,7 @@ Development (next version)
- Kernels are now cached based on their tuning parameters: fits the use-case of 'OverrideParameters'
- Cross-compiling for Android is now supported using CMake; instructions are added to the README
- Improved performance for small GEMM problems by going from 3 to 1 optional temporary buffers
+- GEMM kernel selection (direct vs in-direct) is now done automatically using a new tuner
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 28093980..a97a4eb0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -204,7 +204,9 @@ endif()
set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
xgemm xgemm_direct xgemv)
set(DATABASES copy pad padtranspose transpose xaxpy xdot
- xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger)
+ xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger
+ gemm_routine)
+set(ROUTINE_TUNERS xgemm)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
@@ -243,7 +245,6 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual
src/database/apple_cpu_fallback.hpp
src/database/database.hpp
src/database/database_structure.hpp
- src/database/kernel_selection.hpp
src/routines/level1/xamin.hpp
src/routines/level1/xmax.hpp
src/routines/level1/xmin.hpp
@@ -254,6 +255,7 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual
src/utilities/clblast_exceptions.hpp
src/utilities/device_mapping.hpp
src/utilities/msvc.hpp
+ src/utilities/timing.hpp
src/utilities/utilities.hpp
src/cache.hpp
src/cxpp11_common.hpp
@@ -388,6 +390,12 @@ if(TUNERS)
target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS})
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
endforeach()
+ foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
+ add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
+ target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES})
+ target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC ${CLTUNE_INCLUDE_DIRS})
+ install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
+ endforeach()
# Adds 'alltuners' target: runs all tuners for all precisions
set(ALLTUNERS )
diff --git a/README.md b/README.md
index 8321c2ce..8a0fe17a 100644
--- a/README.md
+++ b/README.md
@@ -196,6 +196,8 @@ In summary, tuning the entire library for your device can be done as follows (st
Alternatively, you can also supply your tuning parameters programmatically through the CLBlast API. This is especially useful if you tune for specific non-standard arguments (e.g. a rectangular or a very small matrix). To do so, you can call the `OverrideParameters` function which will set new parameters for a specific kernel. At the first next call of the target routine, CLBlast will compile a new binary and use it together with the new parameters from then on. Until `OverrideParameters` is called again of course. See the [API documentation](doc/clblast.md#overrideparameters-override-tuning-parameters-auxiliary-function) for more details.
+After the kernels are tuned, you can run the `clblast_tuner_routine_xgemm` tuner to optimize the high-level GEMM routine, i.e. selecting which method to use: the direct kernel or the in-direct kernel.
+
Compiling the correctness tests (optional)
-------------
diff --git a/ROADMAP.md b/ROADMAP.md
index 13e45add..4209c239 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -6,7 +6,9 @@ This file gives an overview of the main features planned for addition to CLBlast
| Issue# | When | Who | Status | What |
| -----------|-------------|-----------|--------|---------------|
| - | Oct '17 | CNugteren | ✔ | CUDA API for CLBlast |
-| [#169](https://github.com/CNugteren/CLBlast/issues/169), [#195](https://github.com/CNugteren/CLBlast/issues/195) | Oct-Nov '17 | CNugteren | | Auto-tuning the kernel selection parameter |
-| [#181](https://github.com/CNugteren/CLBlast/issues/181), [#201](https://github.com/CNugteren/CLBlast/issues/201) | Nov '17 | CNugteren | | Compilation for Android and testing on Qualcomm Adreno |
+| [#169](https://github.com/CNugteren/CLBlast/issues/169), [#195](https://github.com/CNugteren/CLBlast/issues/195) | Oct-Nov '17 | CNugteren | ✔ | Auto-tuning the kernel selection parameter |
+| [#181](https://github.com/CNugteren/CLBlast/issues/181), [#201](https://github.com/CNugteren/CLBlast/issues/201) | Nov '17 | CNugteren | ✔ | Compilation for Android and testing on a device |
+| - | Nov '17 | CNugteren | | Integration of CLTune for easy testing on Android / fewer dependencies |
| [#128](https://github.com/CNugteren/CLBlast/issues/128), [#205](https://github.com/CNugteren/CLBlast/issues/205) | Nov-Dec '17 | CNugteren | | Pre-processor for loop unrolling and array-to-register-promotion for e.g. ARM Mali |
+| [#207](https://github.com/CNugteren/CLBlast/issues/207) | Dec '17 | CNugteren | | Tuning of the TRSM/TRSV routines |
| [#169](https://github.com/CNugteren/CLBlast/issues/169) | '17 | dividiti | | Problem-specific tuning parameter selection |
diff --git a/scripts/database/database/bests.py b/scripts/database/database/bests.py
index c924efde..8ea8b48a 100644
--- a/scripts/database/database/bests.py
+++ b/scripts/database/database/bests.py
@@ -38,17 +38,17 @@ def get_relative_bests(name, common_results, common_parameters, verbose=False):
"""Retrieves the parameters with the relative best execution time over different devices"""
# Helper function
- def argmax(iterable):
- return max(enumerate(iterable), key=lambda x: x[1])[0]
+ def argmin(iterable):
+ return min(enumerate(iterable), key=lambda x: x[1])[0]
# Computes the sum of the execution times over the different devices
performance_sums = []
for parameters in common_parameters:
- performance_sum = sum([r["relative_performance"] for r in common_results if r["parameters"] == parameters])
+ performance_sum = sum([r["relative_time"] for r in common_results if r["parameters"] == parameters])
performance_sums.append(performance_sum)
- # Retrieves the entry with the highest performance
- best_index = argmax(performance_sums)
+ # Retrieves the entry with the lowest time
+ best_index = argmin(performance_sums)
best_performance = performance_sums[best_index]
best_parameters = common_parameters[best_index]
diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py
index 428bfdda..2b4f734c 100644
--- a/scripts/database/database/clblast.py
+++ b/scripts/database/database/clblast.py
@@ -23,7 +23,8 @@ DEVICE_TYPE_ATTRIBUTES = ["clblast_device_vendor", "clblast_device_type"]
DEVICE_ATTRIBUTES = ["clblast_device_name", "clblast_device_architecture",
"device_core_clock", "device_compute_units"]
KERNEL_ATTRIBUTES = ["precision", "kernel_family"]
-ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
+ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta",
+ "arg_from", "arg_to", "arg_step"]
ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES
GROUP_ATTRIBUTES = DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ["kernel"] + ARGUMENT_ATTRIBUTES
diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py
index 6042c374..d9e58253 100644
--- a/scripts/database/database/defaults.py
+++ b/scripts/database/database/defaults.py
@@ -62,7 +62,7 @@ def add_default_sections(database, grouping, verbose, values_dict, condition, en
assert len(group) > 0
if condition(group[0]):
- # Stores all the section's data
+ # Stores all the section's data
default_section = {}
for attribute in group[0].keys():
if attribute != "results" and attribute != "group_identifier":
@@ -180,7 +180,8 @@ def get_common_best_parameters(group, group_identifier, verbose, enable_warning)
assert len(section["results"]) > 0
minimum_time = min([result["time"] for result in section["results"]])
for result in section["results"]:
- result["relative_performance"] = minimum_time / result["time"]
+ base_line = minimum_time if section["kernel"] != "gemm_kernel_selection" else 1.0
+ result["relative_time"] = result["time"] / base_line
# Determine which parameters are available for all devices
common_parameters = get_parameter_names(group[0]) # Parameters of the first section
diff --git a/scripts/database/database/io.py b/scripts/database/database/io.py
index 722658d8..15a39cc1 100644
--- a/scripts/database/database/io.py
+++ b/scripts/database/database/io.py
@@ -65,7 +65,7 @@ def decompress_database(database):
for result in section["results"]:
parameters = {}
for name, value in zip(section["parameter_names"], result[0].split(",")):
- parameters[name] = value
+ parameters[name] = int(value)
new_result = {
"parameters": parameters,
"time": result[1]
diff --git a/src/database/database.cpp b/src/database/database.cpp
index 836c8803..2fa86151 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -30,10 +30,11 @@
#include "database/kernels/transpose/transpose.hpp"
#include "database/kernels/padtranspose/padtranspose.hpp"
+#include "database/kernels/gemm_routine/gemm_routine.hpp"
+
#include "database/kernels/xtrsv.hpp"
#include "database/kernels/invert.hpp"
#include "database/apple_cpu_fallback.hpp"
-#include "database/kernel_selection.hpp"
namespace clblast {
// =================================================================================================
@@ -54,7 +55,7 @@ const std::vector<database::DatabaseEntry> Database::database = std::vector<data
database::TransposeHalf, database::TransposeSingle, database::TransposeDouble, database::TransposeComplexSingle, database::TransposeComplexDouble,
database::PadtransposeHalf, database::PadtransposeSingle, database::PadtransposeDouble, database::PadtransposeComplexSingle, database::PadtransposeComplexDouble,
database::InvertHalf, database::InvertSingle, database::InvertDouble, database::InvertComplexSingle, database::InvertComplexDouble,
- database::KernelSelectionHalf, database::KernelSelectionSingle, database::KernelSelectionDouble, database::KernelSelectionComplexSingle, database::KernelSelectionComplexDouble
+ database::GemmRoutineHalf, database::GemmRoutineSingle, database::GemmRoutineDouble, database::GemmRoutineComplexSingle, database::GemmRoutineComplexDouble
};
const std::vector<database::DatabaseEntry> Database::apple_cpu_fallback = std::vector<database::DatabaseEntry>{
database::XaxpyApple, database::XdotApple,
diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp
deleted file mode 100644
index 6d74b9f9..00000000
--- a/src/database/kernel_selection.hpp
+++ /dev/null
@@ -1,136 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-// Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel
-// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the
-// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use
-// some common default values.
-//
-// =================================================================================================
-
-namespace clblast {
-namespace database {
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionHalf = {
- "KernelSelection", Precision::kHalf, {"XGEMM_MIN_INDIRECT_SIZE"}, {
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionSingle = {
- "KernelSelection", Precision::kSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, {
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- {
- kDeviceTypeGPU, "ARM", {
- { "default", { { kDeviceNameDefault, Params{ 128*128*128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionComplexSingle = {
- "KernelSelection", Precision::kComplexSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, {
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionDouble = {
- "KernelSelection", Precision::kDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, {
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- }
-};
-
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionComplexDouble = {
- "KernelSelection", Precision::kComplexDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, {
- { // Intel GPUs
- kDeviceTypeGPU, "Intel", {
- { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- { // NVIDIA GPUs
- kDeviceTypeGPU, "NVIDIA", {
- { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- { // Default
- kDeviceTypeAll, "default", {
- { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
- }
- },
- }
-};
-
-// =================================================================================================
-} // namespace database
-} // namespace clblast
diff --git a/src/database/kernels/copy/copy_64.hpp b/src/database/kernels/copy/copy_64.hpp
index 3b545a9c..d649f5ef 100644
--- a/src/database/kernels/copy/copy_64.hpp
+++ b/src/database/kernels/copy/copy_64.hpp
@@ -97,7 +97,7 @@ const DatabaseEntry CopyDouble = {
{ Name{"GeForce GTX TITAN Black "}, Params{ 16, 8, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Tesla K20m "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Tesla K40m "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 32, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "SM5.0", {
{ Name{"GeForce GTX 750 "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/copy/copy_6464.hpp b/src/database/kernels/copy/copy_6464.hpp
index 290ad051..2dfad2c6 100644
--- a/src/database/kernels/copy/copy_6464.hpp
+++ b/src/database/kernels/copy/copy_6464.hpp
@@ -124,7 +124,7 @@ const DatabaseEntry CopyComplexDouble = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/gemm_routine/gemm_routine.hpp b/src/database/kernels/gemm_routine/gemm_routine.hpp
new file mode 100644
index 00000000..f1470252
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine.hpp
@@ -0,0 +1,14 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine' kernels.
+//
+// =================================================================================================
+
+#include "database/kernels/gemm_routine/gemm_routine_16.hpp"
+#include "database/kernels/gemm_routine/gemm_routine_32.hpp"
+#include "database/kernels/gemm_routine/gemm_routine_3232.hpp"
+#include "database/kernels/gemm_routine/gemm_routine_64.hpp"
+#include "database/kernels/gemm_routine/gemm_routine_6464.hpp"
diff --git a/src/database/kernels/gemm_routine/gemm_routine_16.hpp b/src/database/kernels/gemm_routine/gemm_routine_16.hpp
new file mode 100644
index 00000000..3d849420
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_16.hpp
@@ -0,0 +1,34 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine16' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineHalf = {
+ "GemmRoutine", Precision::kHalf, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "default", {
+ { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", {
+ { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine_32.hpp b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
new file mode 100644
index 00000000..8a300444
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
@@ -0,0 +1,58 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine32' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineSingle = {
+ "GemmRoutine", Precision::kSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "default", {
+ { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ { // Intel(R) Corporation CPUs
+ kDeviceTypeCPU, "Intel(R) Corporation", {
+ { "default", {
+ { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ { // NVIDIA Corporation GPUs
+ kDeviceTypeGPU, "NVIDIA Corporation", {
+ { "SM5.0", {
+ { Name{"GeForce GTX 750 Ti "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ { "SM6.1", {
+ { Name{"GeForce GTX 1080 Ti "}, Params{ 1792, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"TITAN X (Pascal) "}, Params{ 1664, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1664, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ { "default", {
+ { kDeviceNameDefault , Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", {
+ { kDeviceNameDefault , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
new file mode 100644
index 00000000..1861127a
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
@@ -0,0 +1,58 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine3232' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineComplexSingle = {
+ "GemmRoutine", Precision::kComplexSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "default", {
+ { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ { // Intel(R) Corporation CPUs
+ kDeviceTypeCPU, "Intel(R) Corporation", {
+ { "default", {
+ { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ { // NVIDIA Corporation GPUs
+ kDeviceTypeGPU, "NVIDIA Corporation", {
+ { "SM5.0", {
+ { Name{"GeForce GTX 750 Ti "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ { "SM6.1", {
+ { Name{"GeForce GTX 1080 Ti "}, Params{ 1408, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"TITAN X (Pascal) "}, Params{ 1472, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1408, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ { "default", {
+ { kDeviceNameDefault , Params{ 1152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", {
+ { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine_64.hpp b/src/database/kernels/gemm_routine/gemm_routine_64.hpp
new file mode 100644
index 00000000..840276d4
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_64.hpp
@@ -0,0 +1,50 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine64' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineDouble = {
+ "GemmRoutine", Precision::kDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+ { // Intel(R) Corporation CPUs
+ kDeviceTypeCPU, "Intel(R) Corporation", {
+ { "default", {
+ { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ { // NVIDIA Corporation GPUs
+ kDeviceTypeGPU, "NVIDIA Corporation", {
+ { "SM5.0", {
+ { Name{"GeForce GTX 750 Ti "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ { "SM6.1", {
+ { Name{"GeForce GTX 1080 Ti "}, Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"TITAN X (Pascal) "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ { "default", {
+ { kDeviceNameDefault , Params{ 704, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", {
+ { kDeviceNameDefault , Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine_6464.hpp b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp
new file mode 100644
index 00000000..36498186
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp
@@ -0,0 +1,50 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine6464' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineComplexDouble = {
+ "GemmRoutine", Precision::kComplexDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+ { // Intel(R) Corporation CPUs
+ kDeviceTypeCPU, "Intel(R) Corporation", {
+ { "default", {
+ { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ { // NVIDIA Corporation GPUs
+ kDeviceTypeGPU, "NVIDIA Corporation", {
+ { "SM5.0", {
+ { Name{"GeForce GTX 750 Ti "}, Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ { "SM6.1", {
+ { Name{"GeForce GTX 1080 Ti "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"TITAN X (Pascal) "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ { "default", {
+ { kDeviceNameDefault , Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", {
+ { kDeviceNameDefault , Params{ 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
+ }
+ },
+ }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/pad/pad_3232.hpp b/src/database/kernels/pad/pad_3232.hpp
index 06823819..2f9893bf 100644
--- a/src/database/kernels/pad/pad_3232.hpp
+++ b/src/database/kernels/pad/pad_3232.hpp
@@ -154,7 +154,7 @@ const DatabaseEntry PadComplexSingle = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/padtranspose/padtranspose_32.hpp b/src/database/kernels/padtranspose/padtranspose_32.hpp
index 4b87afb2..bdd24058 100644
--- a/src/database/kernels/padtranspose/padtranspose_32.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_32.hpp
@@ -154,7 +154,7 @@ const DatabaseEntry PadtransposeSingle = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/padtranspose/padtranspose_3232.hpp b/src/database/kernels/padtranspose/padtranspose_3232.hpp
index a810aae4..b4c6e274 100644
--- a/src/database/kernels/padtranspose/padtranspose_3232.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_3232.hpp
@@ -89,7 +89,7 @@ const DatabaseEntry PadtransposeComplexSingle = {
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/padtranspose/padtranspose_64.hpp b/src/database/kernels/padtranspose/padtranspose_64.hpp
index 84b21157..910746c6 100644
--- a/src/database/kernels/padtranspose/padtranspose_64.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_64.hpp
@@ -124,7 +124,7 @@ const DatabaseEntry PadtransposeDouble = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/transpose/transpose_3232.hpp b/src/database/kernels/transpose/transpose_3232.hpp
index a82af30d..a502ba39 100644
--- a/src/database/kernels/transpose/transpose_3232.hpp
+++ b/src/database/kernels/transpose/transpose_3232.hpp
@@ -52,7 +52,7 @@ const DatabaseEntry TransposeComplexSingle = {
{ kDeviceNameDefault , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "default", {
- { kDeviceNameDefault , Params{ 8, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/transpose/transpose_64.hpp b/src/database/kernels/transpose/transpose_64.hpp
index f8cf65fb..6d0ed746 100644
--- a/src/database/kernels/transpose/transpose_64.hpp
+++ b/src/database/kernels/transpose/transpose_64.hpp
@@ -124,7 +124,7 @@ const DatabaseEntry TransposeDouble = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/transpose/transpose_6464.hpp b/src/database/kernels/transpose/transpose_6464.hpp
index 89eb95a7..931d395f 100644
--- a/src/database/kernels/transpose/transpose_6464.hpp
+++ b/src/database/kernels/transpose/transpose_6464.hpp
@@ -67,7 +67,7 @@ const DatabaseEntry TransposeComplexDouble = {
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 4, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
@@ -116,7 +116,7 @@ const DatabaseEntry TransposeComplexDouble = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xaxpy/xaxpy_32.hpp b/src/database/kernels/xaxpy/xaxpy_32.hpp
index cce43e24..483968b1 100644
--- a/src/database/kernels/xaxpy/xaxpy_32.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_32.hpp
@@ -140,7 +140,7 @@ const DatabaseEntry XaxpySingle = {
{ kDeviceNameDefault , Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "default", {
- { kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
@@ -155,7 +155,7 @@ const DatabaseEntry XaxpySingle = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 4, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xaxpy/xaxpy_3232.hpp b/src/database/kernels/xaxpy/xaxpy_3232.hpp
index 9f6a9997..c77bcd39 100644
--- a/src/database/kernels/xaxpy/xaxpy_3232.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_3232.hpp
@@ -21,7 +21,7 @@ const DatabaseEntry XaxpyComplexSingle = {
{ "Fiji", {
{ Name{"AMD Radeon R9 Fury X "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 2, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 2, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "Hawaii", {
{ Name{"AMD Radeon R9 290X "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xaxpy/xaxpy_64.hpp b/src/database/kernels/xaxpy/xaxpy_64.hpp
index 9d03c055..2713d04f 100644
--- a/src/database/kernels/xaxpy/xaxpy_64.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_64.hpp
@@ -124,7 +124,7 @@ const DatabaseEntry XaxpyDouble = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xdot/xdot_32.hpp b/src/database/kernels/xdot/xdot_32.hpp
index 08900039..fd98d96f 100644
--- a/src/database/kernels/xdot/xdot_32.hpp
+++ b/src/database/kernels/xdot/xdot_32.hpp
@@ -48,7 +48,7 @@ const DatabaseEntry XdotSingle = {
{ kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "default", {
- { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
@@ -90,7 +90,7 @@ const DatabaseEntry XdotSingle = {
{ Name{"GeForce GT 650M "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"GeForce GTX 670 "}, Params{ 512, 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"GeForce GTX 680 "}, Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 256, 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 256, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "SM3.5", {
{ Name{"GeForce GTX TITAN Black "}, Params{ 512, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -115,7 +115,7 @@ const DatabaseEntry XdotSingle = {
{ kDeviceNameDefault , Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "default", {
- { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
@@ -130,7 +130,7 @@ const DatabaseEntry XdotSingle = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xdot/xdot_3232.hpp b/src/database/kernels/xdot/xdot_3232.hpp
index 06bb8d6e..859b20e8 100644
--- a/src/database/kernels/xdot/xdot_3232.hpp
+++ b/src/database/kernels/xdot/xdot_3232.hpp
@@ -129,7 +129,7 @@ const DatabaseEntry XdotComplexSingle = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xdot/xdot_6464.hpp b/src/database/kernels/xdot/xdot_6464.hpp
index 4fcf9026..c897ef53 100644
--- a/src/database/kernels/xdot/xdot_6464.hpp
+++ b/src/database/kernels/xdot/xdot_6464.hpp
@@ -53,7 +53,7 @@ const DatabaseEntry XdotComplexDouble = {
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
@@ -93,7 +93,7 @@ const DatabaseEntry XdotComplexDouble = {
{ kDeviceNameDefault , Params{ 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "default", {
- { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xgemm/xgemm_32.hpp b/src/database/kernels/xgemm/xgemm_32.hpp
index a4221046..752ce146 100644
--- a/src/database/kernels/xgemm/xgemm_32.hpp
+++ b/src/database/kernels/xgemm/xgemm_32.hpp
@@ -21,7 +21,7 @@ const DatabaseEntry XgemmSingle = {
{ "Fiji", {
{ Name{"AMD Radeon R9 Fury X "}, Params{ 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 4, 4 } },
{ Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 32, 2, 16, 16, 64, 8, 16, 128, 0, 0, 0, 0, 2, 8 } },
- { kDeviceNameDefault , Params{ 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 2, 2 } },
+ { kDeviceNameDefault , Params{ 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 2, 4 } },
} },
{ "Hawaii", {
{ Name{"AMD Radeon R9 290X "}, Params{ 16, 2, 16, 32, 128, 32, 8, 64, 1, 1, 1, 1, 4, 2 } },
@@ -52,7 +52,7 @@ const DatabaseEntry XgemmSingle = {
{ kDeviceNameDefault , Params{ 32, 2, 8, 16, 128, 8, 8, 128, 0, 0, 1, 1, 8, 8 } },
} },
{ "default", {
- { kDeviceNameDefault , Params{ 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 4, 4 } },
+ { kDeviceNameDefault , Params{ 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } },
} },
}
},
@@ -130,7 +130,7 @@ const DatabaseEntry XgemmSingle = {
{ "SM5.2", {
{ Name{"GeForce GTX 980 "}, Params{ 16, 2, 16, 16, 64, 16, 8, 128, 1, 1, 1, 0, 4, 8 } },
{ Name{"GeForce GTX TITAN X "}, Params{ 16, 2, 8, 16, 128, 8, 8, 128, 1, 1, 1, 1, 4, 8 } },
- { kDeviceNameDefault , Params{ 16, 2, 16, 16, 128, 16, 8, 128, 1, 1, 1, 0, 4, 8 } },
+ { kDeviceNameDefault , Params{ 16, 2, 8, 16, 64, 8, 8, 128, 1, 1, 1, 0, 4, 8 } },
} },
{ "SM6.1", {
{ Name{"GeForce GTX 1070 "}, Params{ 16, 2, 32, 16, 128, 32, 8, 128, 1, 1, 1, 0, 4, 1 } },
@@ -140,7 +140,7 @@ const DatabaseEntry XgemmSingle = {
{ kDeviceNameDefault , Params{ 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } },
} },
{ "default", {
- { kDeviceNameDefault , Params{ 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 2 } },
+ { kDeviceNameDefault , Params{ 32, 2, 8, 8, 32, 32, 32, 64, 1, 1, 0, 0, 4, 2 } },
} },
}
},
@@ -155,7 +155,7 @@ const DatabaseEntry XgemmSingle = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 2 } },
+ { kDeviceNameDefault , Params{ 32, 2, 8, 8, 32, 16, 16, 32, 1, 1, 0, 0, 4, 2 } },
} },
}
},
diff --git a/src/database/kernels/xgemm/xgemm_3232.hpp b/src/database/kernels/xgemm/xgemm_3232.hpp
index 110a2f2e..ed7a2254 100644
--- a/src/database/kernels/xgemm/xgemm_3232.hpp
+++ b/src/database/kernels/xgemm/xgemm_3232.hpp
@@ -21,7 +21,7 @@ const DatabaseEntry XgemmComplexSingle = {
{ "Fiji", {
{ Name{"AMD Radeon R9 Fury X "}, Params{ 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 1, 2 } },
{ Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 32, 2, 32, 32, 64, 8, 8, 64, 0, 0, 1, 1, 2, 8 } },
- { kDeviceNameDefault , Params{ 32, 2, 8, 8, 16, 32, 32, 32, 1, 1, 0, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 32, 2, 16, 16, 64, 8, 8, 64, 0, 0, 0, 0, 2, 4 } },
} },
{ "Hawaii", {
{ Name{"AMD Radeon R9 290X "}, Params{ 32, 2, 32, 8, 32, 8, 16, 32, 1, 0, 1, 0, 1, 1 } },
@@ -106,13 +106,13 @@ const DatabaseEntry XgemmComplexSingle = {
{ "SM2.0", {
{ Name{"GeForce GTX 480 "}, Params{ 16, 2, 16, 16, 32, 32, 16, 128, 0, 1, 1, 1, 2, 2 } },
{ Name{"GeForce GTX 580 "}, Params{ 32, 2, 16, 8, 32, 32, 32, 128, 1, 0, 1, 0, 1, 1 } },
- { kDeviceNameDefault , Params{ 16, 2, 16, 16, 32, 32, 16, 128, 0, 0, 1, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 16, 2, 16, 8, 32, 32, 16, 128, 0, 0, 1, 0, 1, 1 } },
} },
{ "SM3.0", {
{ Name{"GRID K520 "}, Params{ 16, 8, 32, 32, 64, 32, 16, 128, 1, 0, 1, 0, 1, 4 } },
{ Name{"GeForce GTX 670 "}, Params{ 16, 2, 32, 32, 64, 32, 8, 32, 1, 1, 1, 1, 1, 1 } },
{ Name{"GeForce GTX 680 "}, Params{ 16, 2, 32, 16, 64, 32, 32, 128, 1, 0, 0, 0, 2, 2 } },
- { kDeviceNameDefault , Params{ 16, 2, 32, 16, 64, 32, 16, 128, 1, 0, 0, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 16, 2, 32, 16, 64, 32, 8, 32, 1, 0, 0, 0, 1, 1 } },
} },
{ "SM3.5", {
{ Name{"GeForce GTX TITAN "}, Params{ 16, 8, 16, 16, 64, 32, 16, 64, 1, 1, 1, 0, 1, 1 } },
@@ -124,19 +124,19 @@ const DatabaseEntry XgemmComplexSingle = {
{ "SM5.0", {
{ Name{"GeForce GTX 750 "}, Params{ 16, 8, 16, 16, 64, 16, 16, 64, 1, 1, 1, 0, 2, 2 } },
{ Name{"GeForce GTX 750 Ti "}, Params{ 16, 2, 16, 8, 32, 32, 16, 64, 1, 1, 1, 0, 1, 2 } },
- { kDeviceNameDefault , Params{ 16, 2, 16, 16, 32, 16, 16, 64, 1, 1, 1, 0, 1, 2 } },
+ { kDeviceNameDefault , Params{ 16, 2, 16, 8, 32, 16, 16, 64, 1, 1, 1, 0, 1, 2 } },
} },
{ "SM5.2", {
{ Name{"GeForce GTX 980 "}, Params{ 32, 8, 32, 32, 64, 16, 16, 64, 1, 1, 1, 0, 2, 1 } },
{ Name{"GeForce GTX TITAN X "}, Params{ 16, 2, 8, 8, 64, 8, 8, 32, 1, 0, 1, 1, 1, 4 } },
- { kDeviceNameDefault , Params{ 16, 2, 32, 32, 64, 16, 16, 32, 1, 0, 1, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 16, 2, 8, 8, 64, 8, 8, 32, 1, 0, 1, 0, 1, 1 } },
} },
{ "SM6.1", {
{ Name{"GeForce GTX 1070 "}, Params{ 16, 2, 16, 16, 128, 16, 16, 64, 1, 1, 1, 1, 2, 4 } },
{ Name{"GeForce GTX 1080 "}, Params{ 16, 2, 32, 16, 64, 32, 8, 64, 1, 1, 0, 0, 1, 2 } },
{ Name{"GeForce GTX 1080 Ti "}, Params{ 16, 2, 8, 16, 32, 16, 8, 64, 1, 1, 0, 0, 1, 1 } },
{ Name{"TITAN X (Pascal) "}, Params{ 32, 2, 32, 32, 64, 8, 8, 32, 1, 1, 0, 0, 2, 4 } },
- { kDeviceNameDefault , Params{ 32, 2, 8, 8, 16, 16, 16, 32, 1, 1, 0, 0, 2, 2 } },
+ { kDeviceNameDefault , Params{ 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 2, 4 } },
} },
{ "default", {
{ kDeviceNameDefault , Params{ 32, 2, 8, 8, 16, 32, 32, 64, 1, 1, 0, 0, 1, 1 } },
@@ -146,7 +146,7 @@ const DatabaseEntry XgemmComplexSingle = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 2, 1 } },
+ { kDeviceNameDefault , Params{ 32, 2, 8, 8, 32, 8, 8, 32, 0, 0, 0, 0, 4, 4 } },
} },
}
},
diff --git a/src/database/kernels/xgemm/xgemm_64.hpp b/src/database/kernels/xgemm/xgemm_64.hpp
index b17aea7b..3efab164 100644
--- a/src/database/kernels/xgemm/xgemm_64.hpp
+++ b/src/database/kernels/xgemm/xgemm_64.hpp
@@ -21,7 +21,7 @@ const DatabaseEntry XgemmDouble = {
{ "Fiji", {
{ Name{"AMD Radeon R9 Fury X "}, Params{ 32, 2, 16, 16, 32, 16, 16, 32, 1, 1, 0, 0, 2, 2 } },
{ Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 32, 2, 16, 16, 64, 8, 8, 32, 0, 0, 0, 0, 4, 4 } },
- { kDeviceNameDefault , Params{ 32, 2, 8, 8, 16, 32, 32, 32, 1, 1, 0, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 32, 2, 8, 8, 32, 8, 8, 32, 0, 0, 0, 0, 4, 4 } },
} },
{ "Hawaii", {
{ Name{"AMD Radeon R9 290X "}, Params{ 16, 8, 32, 8, 128, 8, 8, 32, 0, 1, 0, 0, 1, 4 } },
@@ -67,7 +67,7 @@ const DatabaseEntry XgemmDouble = {
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 32, 2, 16, 8, 128, 8, 8, 128, 1, 0, 0, 0, 2, 8 } },
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 32, 2, 8, 16, 128, 16, 8, 128, 0, 0, 1, 1, 1, 8 } },
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 32, 2, 8, 16, 64, 16, 8, 64, 0, 1, 1, 0, 1, 4 } },
- { kDeviceNameDefault , Params{ 32, 2, 32, 32, 32, 16, 16, 64, 1, 1, 0, 0, 1, 4 } },
+ { kDeviceNameDefault , Params{ 32, 2, 8, 8, 64, 8, 8, 64, 0, 0, 0, 0, 1, 2 } },
} },
}
},
@@ -84,30 +84,30 @@ const DatabaseEntry XgemmDouble = {
{ "SM2.0", {
{ Name{"GeForce GTX 480 "}, Params{ 16, 2, 8, 16, 32, 32, 8, 64, 1, 1, 1, 0, 1, 2 } },
{ Name{"GeForce GTX 580 "}, Params{ 32, 2, 32, 16, 64, 8, 8, 32, 0, 1, 1, 1, 1, 4 } },
- { kDeviceNameDefault , Params{ 16, 2, 32, 16, 32, 32, 8, 32, 0, 1, 1, 0, 1, 2 } },
+ { kDeviceNameDefault , Params{ 16, 2, 8, 16, 32, 8, 8, 32, 0, 1, 1, 0, 1, 2 } },
} },
{ "SM3.0", {
{ Name{"GRID K520 "}, Params{ 16, 2, 8, 8, 16, 8, 8, 32, 1, 0, 0, 1, 2, 2 } },
{ Name{"GeForce GTX 670 "}, Params{ 32, 8, 16, 32, 128, 16, 8, 32, 0, 1, 1, 0, 1, 1 } },
{ Name{"GeForce GTX 680 "}, Params{ 32, 8, 8, 8, 32, 16, 32, 128, 1, 0, 0, 1, 2, 4 } },
- { kDeviceNameDefault , Params{ 16, 2, 16, 32, 128, 16, 32, 128, 0, 0, 0, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 16, 2, 8, 8, 16, 8, 8, 32, 0, 0, 0, 0, 1, 1 } },
} },
{ "SM3.5", {
{ Name{"GeForce GTX TITAN "}, Params{ 16, 8, 16, 8, 32, 16, 32, 128, 1, 1, 1, 1, 2, 2 } },
{ Name{"GeForce GTX TITAN Black "}, Params{ 16, 2, 16, 8, 16, 16, 8, 16, 1, 1, 1, 0, 1, 1 } },
{ Name{"Tesla K20m "}, Params{ 16, 2, 32, 8, 32, 16, 16, 64, 1, 0, 0, 0, 1, 1 } },
{ Name{"Tesla K40m "}, Params{ 32, 2, 16, 8, 64, 16, 32, 128, 1, 0, 1, 1, 2, 4 } },
- { kDeviceNameDefault , Params{ 16, 2, 16, 8, 16, 16, 16, 128, 1, 0, 0, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 16, 2, 16, 8, 16, 16, 8, 16, 1, 0, 0, 0, 1, 1 } },
} },
{ "SM5.0", {
{ Name{"GeForce GTX 750 "}, Params{ 32, 8, 16, 32, 64, 16, 8, 128, 0, 0, 0, 1, 2, 1 } },
{ Name{"GeForce GTX 750 Ti "}, Params{ 32, 2, 8, 8, 32, 16, 16, 32, 0, 0, 0, 0, 4, 2 } },
- { kDeviceNameDefault , Params{ 32, 2, 16, 32, 32, 16, 16, 128, 0, 0, 0, 0, 2, 1 } },
+ { kDeviceNameDefault , Params{ 32, 2, 8, 8, 32, 16, 8, 32, 0, 0, 0, 0, 2, 1 } },
} },
{ "SM5.2", {
{ Name{"GeForce GTX 980 "}, Params{ 32, 8, 16, 8, 64, 32, 32, 128, 0, 0, 1, 0, 2, 4 } },
{ Name{"GeForce GTX TITAN X "}, Params{ 16, 8, 16, 16, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } },
- { kDeviceNameDefault , Params{ 16, 8, 16, 16, 16, 16, 16, 128, 0, 0, 0, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 16, 8, 16, 8, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } },
} },
{ "SM6.1", {
{ Name{"GeForce GTX 1070 "}, Params{ 16, 2, 8, 16, 32, 8, 8, 64, 0, 0, 1, 1, 2, 8 } },
@@ -124,7 +124,7 @@ const DatabaseEntry XgemmDouble = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 32, 2, 32, 32, 32, 8, 8, 32, 1, 1, 0, 0, 1, 4 } },
+ { kDeviceNameDefault , Params{ 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 2, 2 } },
} },
}
},
diff --git a/src/database/kernels/xgemm/xgemm_6464.hpp b/src/database/kernels/xgemm/xgemm_6464.hpp
index 6d28ab77..57df2480 100644
--- a/src/database/kernels/xgemm/xgemm_6464.hpp
+++ b/src/database/kernels/xgemm/xgemm_6464.hpp
@@ -84,13 +84,13 @@ const DatabaseEntry XgemmComplexDouble = {
{ "SM2.0", {
{ Name{"GeForce GTX 480 "}, Params{ 16, 2, 32, 32, 32, 32, 8, 32, 0, 0, 1, 0, 1, 1 } },
{ Name{"GeForce GTX 580 "}, Params{ 32, 2, 32, 32, 32, 8, 8, 64, 0, 0, 0, 0, 1, 2 } },
- { kDeviceNameDefault , Params{ 16, 2, 32, 32, 32, 32, 8, 32, 0, 0, 0, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 16, 2, 32, 32, 32, 8, 8, 32, 0, 0, 0, 0, 1, 1 } },
} },
{ "SM3.0", {
{ Name{"GRID K520 "}, Params{ 32, 8, 16, 16, 16, 8, 16, 64, 1, 0, 1, 1, 1, 1 } },
{ Name{"GeForce GTX 670 "}, Params{ 32, 8, 16, 8, 16, 16, 32, 64, 1, 0, 0, 1, 1, 2 } },
{ Name{"GeForce GTX 680 "}, Params{ 16, 8, 16, 8, 64, 16, 32, 32, 0, 1, 1, 0, 1, 1 } },
- { kDeviceNameDefault , Params{ 16, 8, 16, 16, 16, 16, 16, 32, 0, 0, 0, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 16, 8, 16, 8, 16, 8, 16, 32, 0, 0, 0, 0, 1, 1 } },
} },
{ "SM3.5", {
{ Name{"GeForce GTX TITAN Black "}, Params{ 16, 2, 16, 16, 32, 16, 8, 32, 0, 1, 1, 1, 1, 1 } },
@@ -101,12 +101,12 @@ const DatabaseEntry XgemmComplexDouble = {
{ "SM5.0", {
{ Name{"GeForce GTX 750 "}, Params{ 32, 2, 8, 32, 32, 8, 8, 64, 0, 0, 1, 0, 1, 4 } },
{ Name{"GeForce GTX 750 Ti "}, Params{ 32, 2, 8, 8, 16, 8, 8, 32, 0, 0, 0, 0, 1, 1 } },
- { kDeviceNameDefault , Params{ 32, 2, 8, 32, 16, 8, 8, 32, 0, 0, 0, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 32, 2, 8, 8, 16, 8, 8, 32, 0, 0, 0, 0, 1, 1 } },
} },
{ "SM5.2", {
{ Name{"GeForce GTX 980 "}, Params{ 16, 2, 16, 8, 32, 8, 16, 128, 0, 0, 1, 1, 2, 2 } },
{ Name{"GeForce GTX TITAN X "}, Params{ 32, 8, 16, 16, 128, 16, 16, 32, 0, 0, 1, 0, 1, 1 } },
- { kDeviceNameDefault , Params{ 16, 2, 16, 16, 128, 16, 16, 128, 0, 0, 1, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 16, 2, 8, 8, 32, 8, 16, 32, 0, 0, 1, 0, 1, 1 } },
} },
{ "SM6.1", {
{ Name{"GeForce GTX 1070 "}, Params{ 32, 8, 32, 16, 32, 8, 8, 32, 0, 0, 0, 1, 1, 4 } },
@@ -116,14 +116,14 @@ const DatabaseEntry XgemmComplexDouble = {
{ kDeviceNameDefault , Params{ 32, 2, 32, 32, 32, 32, 32, 64, 0, 0, 0, 0, 1, 2 } },
} },
{ "default", {
- { kDeviceNameDefault , Params{ 32, 2, 16, 16, 16, 16, 16, 32, 0, 0, 0, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 32, 2, 16, 16, 16, 8, 8, 32, 0, 0, 0, 0, 1, 1 } },
} },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 32, 2, 16, 16, 32, 16, 16, 64, 0, 0, 0, 0, 2, 2 } },
+ { kDeviceNameDefault , Params{ 32, 2, 32, 32, 32, 8, 8, 32, 1, 1, 0, 0, 1, 2 } },
} },
}
},
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
index 7458d0b6..fd5d2a76 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
@@ -45,6 +45,7 @@ const DatabaseEntry XgemmDirectSingle = {
{ "default", {
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 8, 8, 8, 8, 0, 0, 1, 8, 64, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 8, 16, 16, 16, 16, 0, 0, 1, 1, 64, 0, 0, 0, 0 } },
+ { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 2, 32, 32, 32, 32, 0, 0, 1, 1, 64, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 16, 16, 8, 8, 8, 0, 0, 2, 4, 32, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 8, 8, 8, 8, 0, 0, 2, 2, 64, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 2, 8, 8, 16, 8, 0, 0, 4, 4, 64, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
index 4242743a..883a4b0a 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
@@ -41,10 +41,11 @@ const DatabaseEntry XgemmDirectComplexSingle = {
{ "default", {
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 8, 8, 8, 8, 0, 0, 4, 4, 32, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 2, 16, 8, 16, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0 } },
+ { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 8, 8, 8, 8, 8, 0, 0, 1, 1, 8, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 4, 32, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 8, 8, 16, 8, 1, 1, 2, 1, 32, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0 } },
} },
}
},
@@ -53,7 +54,7 @@ const DatabaseEntry XgemmDirectComplexSingle = {
{ "default", {
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
{ Name{"Iris Pro "}, Params{ 2, 16, 16, 8, 8, 1, 1, 2, 2, 32, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 2, 16, 16, 16, 16, 1, 1, 1, 1, 16, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp
index 14d4ccae..adc3f408 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp
@@ -37,10 +37,11 @@ const DatabaseEntry XgemmDirectDouble = {
{ "default", {
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0 } },
+ { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 8, 8, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 8, 8, 8, 8, 8, 0, 1, 1, 1, 8, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 4, 32, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0 } },
} },
}
},
@@ -72,7 +73,7 @@ const DatabaseEntry XgemmDirectDouble = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 2, 2, 32, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp
index ef6940ee..23ae923e 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp
@@ -37,6 +37,7 @@ const DatabaseEntry XgemmDirectComplexDouble = {
{ "default", {
{ Name{"Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz "}, Params{ 2, 8, 8, 32, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz "}, Params{ 2, 16, 16, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0 } },
+ { Name{"Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz "}, Params{ 2, 8, 8, 8, 8, 0, 0, 2, 2, 32, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz "}, Params{ 8, 16, 16, 8, 8, 0, 0, 2, 1, 32, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 8, 16, 8, 8, 8, 0, 0, 2, 2, 32, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 2, 32, 8, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemv/xgemv_32.hpp b/src/database/kernels/xgemv/xgemv_32.hpp
index 471273d2..2c886bbb 100644
--- a/src/database/kernels/xgemv/xgemv_32.hpp
+++ b/src/database/kernels/xgemv/xgemv_32.hpp
@@ -52,7 +52,7 @@ const DatabaseEntry XgemvSingle = {
{ kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "default", {
- { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
@@ -146,7 +146,7 @@ const DatabaseEntry XgemvSingle = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xgemv/xgemv_3232.hpp b/src/database/kernels/xgemv/xgemv_3232.hpp
index 3b6bb1e8..d09ce003 100644
--- a/src/database/kernels/xgemv/xgemv_3232.hpp
+++ b/src/database/kernels/xgemv/xgemv_3232.hpp
@@ -120,7 +120,7 @@ const DatabaseEntry XgemvComplexSingle = {
{ Name{"GeForce GTX 1080 "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"GeForce GTX 1080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"TITAN X (Pascal) "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "default", {
{ kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemv/xgemv_64.hpp b/src/database/kernels/xgemv/xgemv_64.hpp
index 3f27e5c8..1b6b796e 100644
--- a/src/database/kernels/xgemv/xgemv_64.hpp
+++ b/src/database/kernels/xgemv/xgemv_64.hpp
@@ -115,7 +115,7 @@ const DatabaseEntry XgemvDouble = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp
index 72e2de2b..976a8cd2 100644
--- a/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp
@@ -21,7 +21,7 @@ const DatabaseEntry XgemvFastDouble = {
{ "Fiji", {
{ Name{"AMD Radeon R9 Fury X "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"AMD Radeon R9 M370X Compute Engine "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "Hawaii", {
{ Name{"AMD Radeon R9 290X "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
index 690b0a3f..73627a53 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
@@ -36,7 +36,7 @@ const DatabaseEntry XgemvFastRotSingle = {
{ kDeviceNameDefault , Params{ 8, 128, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "default", {
- { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
index 52a57fb3..7a20b95e 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
@@ -36,7 +36,7 @@ const DatabaseEntry XgemvFastRotComplexSingle = {
{ kDeviceNameDefault , Params{ 8, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "default", {
- { kDeviceNameDefault , Params{ 8, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xger/xger_16.hpp b/src/database/kernels/xger/xger_16.hpp
index 8d261835..dae1a675 100644
--- a/src/database/kernels/xger/xger_16.hpp
+++ b/src/database/kernels/xger/xger_16.hpp
@@ -43,7 +43,7 @@ const DatabaseEntry XgerHalf = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xger/xger_3232.hpp b/src/database/kernels/xger/xger_3232.hpp
index f214e889..769815ef 100644
--- a/src/database/kernels/xger/xger_3232.hpp
+++ b/src/database/kernels/xger/xger_3232.hpp
@@ -52,7 +52,7 @@ const DatabaseEntry XgerComplexSingle = {
{ kDeviceNameDefault , Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "default", {
- { kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
@@ -74,7 +74,7 @@ const DatabaseEntry XgerComplexSingle = {
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 256, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 512, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 256, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 128, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 128, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xger/xger_64.hpp b/src/database/kernels/xger/xger_64.hpp
index 08bf96c9..56e4fc3a 100644
--- a/src/database/kernels/xger/xger_64.hpp
+++ b/src/database/kernels/xger/xger_64.hpp
@@ -101,7 +101,7 @@ const DatabaseEntry XgerDouble = {
{ kDeviceNameDefault , Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "default", {
- { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xger/xger_6464.hpp b/src/database/kernels/xger/xger_6464.hpp
index d1202ce4..78a70e36 100644
--- a/src/database/kernels/xger/xger_6464.hpp
+++ b/src/database/kernels/xger/xger_6464.hpp
@@ -66,7 +66,7 @@ const DatabaseEntry XgerComplexDouble = {
{ Name{"Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz "}, Params{ 512, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz "}, Params{ 256, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz "}, Params{ 256, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 128, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/routine.cpp b/src/routine.cpp
index db9bafea..81201eea 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -43,7 +43,7 @@ const std::unordered_map<std::string, const std::vector<std::string>> Routine::r
{"Padtranspose", routines_gemm_syrk},
{"Xgemm", routines_gemm_syrk},
{"XgemmDirect", routines_gemm},
- {"KernelSelection", routines_gemm},
+ {"GemmRoutine", routines_gemm},
{"Invert", routines_trsm},
};
// =================================================================================================
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index a0063ee2..94392dd0 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -23,7 +23,7 @@ namespace clblast {
template <typename T>
Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name,
- {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
+ {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","GemmRoutine"},
PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
@@ -104,7 +104,9 @@ void Xgemm<T>::DoGemm(const Layout layout,
// Selects which version of GEMM to run
const auto m_n_k = static_cast<unsigned long long>(m) * static_cast<unsigned long long>(n) *
static_cast<unsigned long long>(k);
- const auto do_gemm_direct = (m_n_k < static_cast<unsigned long long>(db_["XGEMM_MIN_INDIRECT_SIZE"]));
+ const auto database_value = static_cast<unsigned long long>(db_["XGEMM_MIN_INDIRECT_SIZE"]);
+ const auto min_indirect_size = database_value * database_value * database_value;
+ const auto do_gemm_direct = (m_n_k < min_indirect_size);
if (do_gemm_direct) { // for small sizes (single kernel)
GemmDirect(m, n, k, alpha,
a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp
index 8a015e97..152e7194 100644
--- a/src/routines/levelx/xgemmbatched.cpp
+++ b/src/routines/levelx/xgemmbatched.cpp
@@ -23,7 +23,7 @@ namespace clblast {
template <typename T>
XgemmBatched<T>::XgemmBatched(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name,
- {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
+ {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","GemmRoutine"},
PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
new file mode 100644
index 00000000..a880c97e
--- /dev/null
+++ b/src/tuning/routines/xgemm.cpp
@@ -0,0 +1,166 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file tunes the Xgemm routine at a high-level: choosing between the direct (single-kernel)
+// and the in-direct (kernel plus pre/post-processing) methods.
+//
+// =================================================================================================
+
+#include <exception>
+#include <string>
+#include <vector>
+#include <assert.h>
+
+#include "utilities/utilities.hpp"
+#include "utilities/timing.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Runs one square GEMM (m = n = k = 'value') through the CLBlast Gemm call and waits on the event
+// it returns. The three buffers are passed as the A, B and C matrices in that order and both
+// scalars are set to ConstantOne<T>(). Throws a RuntimeError if Gemm does not report success.
+template <typename T>
+void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
+  auto raw_queue = queue();
+  cl_event completion{};
+  const auto one = ConstantOne<T>();
+  const auto gemm_status = Gemm(Layout::kRowMajor, Transpose::kNo, Transpose::kNo,
+                                value, value, value, one,
+                                buffers[0](), 0, value,
+                                buffers[1](), 0, value, one,
+                                buffers[2](), 0, value,
+                                &raw_queue, &completion);
+  if (gemm_status != StatusCode::kSuccess) {
+    throw RuntimeError("Gemm failed with status " + ToString(gemm_status));
+  }
+
+  // Blocks on the event of the call above before handing it back to OpenCL
+  clWaitForEvents(1, &completion);
+  clReleaseEvent(completion);
+}
+
+// Overrides the "XGEMM_MIN_INDIRECT_SIZE" parameter of the "GemmRoutine" entry for the given
+// device and the precision belonging to T. Throws a RuntimeError if the override is rejected.
+template <typename T>
+void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device) {
+  const auto status = OverrideParameters(device(), "GemmRoutine", PrecisionValue<T>(),
+                                         {{"XGEMM_MIN_INDIRECT_SIZE", minimum_size}});
+  if (status == StatusCode::kSuccess) { return; }
+  throw RuntimeError("OverrideParameters failed with status " + ToString(status));
+}
+
+// Tunes the switching point between the direct and the in-direct GEMM routines for precision T.
+// Both variants are timed on square problems (m = n = k) for every size that TimeRoutine visits
+// between 'from' and 'to'; every measured size is then scored as a candidate value for
+// XGEMM_MIN_INDIRECT_SIZE and the scores are written to a JSON file. Returns early (after printing
+// a message) when the selected device does not support precision T.
+template <typename T>
+void TuneXgemm(int argc, char* argv[]) {
+  auto command_line_args = RetrieveCommandLineArguments(argc, argv);
+  auto help = std::string{"* Options given/available:\n"};
+  const auto platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}));
+  const auto device_id = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
+  const auto precision = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle);
+  const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, size_t{10});
+  fprintf(stdout, "%s\n", help.c_str());
+
+  // Values for m, n, and k
+  const auto from = size_t{64};
+  const auto to = size_t{2048};
+  const auto step = size_t{64};
+
+  // OpenCL initialisation
+  const auto platform = Platform(platform_id);
+  const auto device = Device(platform, device_id);
+  if (!PrecisionSupported<T>(device)) {
+    printf("* Unsupported precision, skipping this tuning run\n\n");
+    return;
+  }
+  const auto context = Context(device);
+  const auto queue = Queue(context, device);
+
+  // Buffers, sized for the largest problem
+  auto a_mat = Buffer<T>(context, to * to);
+  auto b_mat = Buffer<T>(context, to * to);
+  auto c_mat = Buffer<T>(context, to * to);
+  auto buffers = std::vector<Buffer<T>>{a_mat, b_mat, c_mat};
+
+  // In-direct version: a threshold of zero is never above m * n * k
+  printf("[----------] Testing the in-direct GEMM routine for m=n=k\n");
+  ForceSelectIndirectFrom<T>(0, device);
+  const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
+
+  // Direct version. The Xgemm routine cubes the database value before comparing it against
+  // m * n * k, so the threshold is a per-dimension size: 'to + 1' lies above every size tested
+  // here. Passing 'to * to * to + 1' instead would overflow once it is cubed.
+  printf("[----------] Testing the direct GEMM routine for m=n=k\n");
+  ForceSelectIndirectFrom<T>(to + 1, device);
+  const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
+
+  // Determining final score and best kernel selection point
+  assert(indirect.size() == direct.size());
+  printf("[----------] Collecting results\n");
+  auto ratios = std::vector<double>(indirect.size());
+  for (auto i = size_t{0}; i < indirect.size(); ++i) {
+    // Failed runs are stored with a negative time and cannot be used in a division
+    const auto indirect_failed = (indirect[i].second < 0.0);
+    const auto direct_failed = (direct[i].second < 0.0);
+    if (direct_failed) {
+      ratios[i] = 0.0; // counts as 'in-direct is at least as fast'
+    }
+    else if (indirect_failed) {
+      ratios[i] = 2.0; // any value above 1.0: counts as 'direct is faster'
+    }
+    else {
+      ratios[i] = indirect[i].second / direct[i].second;
+    }
+  }
+  auto scores = std::vector<TuningResult>(ratios.size());
+  for (auto i = size_t{0}; i < scores.size(); ++i) {
+    // Counts the sizes that would be mis-classified when switching at index i: smaller sizes where
+    // in-direct was at least as fast plus larger sizes where direct was faster
+    auto score = 0;
+    for (auto j = size_t{0}; j < i; ++j) { score += (ratios[j] <= 1.0); }
+    for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); }
+    const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones
+    const auto relative_score = static_cast<double>(score) / static_cast<double>(scores.size() - 1);
+    scores[i] = TuningResult{
+      "gemm_kernel_selection",
+      (relative_score * relative_score) * 100 + epsilon, // squared for proper default computation
+      TuningParameters{
+        TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first},
+        TuningParameter{"PRECISION", static_cast<size_t>(precision)}
+      }
+    };
+  }
+
+  // Displaying results
+  printf("[ -------> ] value indirect direct score (lowest means best switching point)\n");
+  for (auto i = size_t{0}; i < indirect.size(); ++i) {
+    assert(indirect[i].first == direct[i].first);
+    const auto value = indirect[i].first;
+    if (indirect[i].second != -1 && direct[i].second != -1) {
+      // Computed in floating-point so that the product cannot overflow a narrow size_t
+      const auto flops = 2.0 * value * value * value;
+      const auto gflops_indirect = flops / (indirect[i].second * 1.0e6);
+      const auto gflops_direct = flops / (direct[i].second * 1.0e6);
+      printf("[ -------> ] %7zu %8.2lf %8.2lf %8.2lf\n",
+             value, gflops_indirect, gflops_direct, scores[i].score);
+    }
+  }
+
+  // Outputs the results as JSON to disk, including some meta-data
+  const auto precision_string = std::to_string(static_cast<size_t>(precision));
+  auto metadata = std::vector<std::pair<std::string,std::string>>{
+    {"kernel_family", "gemm_routine"},
+    {"arg_from", ToString(from)},
+    {"arg_to", ToString(to)},
+    {"arg_step", ToString(step)},
+    {"precision", precision_string},
+  };
+  PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json",
+                           device, platform, metadata, scores);
+
+  printf("[ STATUS ] All done\n");
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using half = clblast::half;
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace): runs the tuner instantiated for the precision
+// that is selected through the command-line arguments
+int main(int argc, char *argv[]) {
+  const auto arguments = clblast::RetrieveCommandLineArguments(argc, argv);
+  const auto precision = clblast::GetPrecision(arguments);
+  if (precision == clblast::Precision::kHalf) { clblast::TuneXgemm<half>(argc, argv); }
+  else if (precision == clblast::Precision::kSingle) { clblast::TuneXgemm<float>(argc, argv); }
+  else if (precision == clblast::Precision::kDouble) { clblast::TuneXgemm<double>(argc, argv); }
+  else if (precision == clblast::Precision::kComplexSingle) { clblast::TuneXgemm<float2>(argc, argv); }
+  else if (precision == clblast::Precision::kComplexDouble) { clblast::TuneXgemm<double2>(argc, argv); }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
new file mode 100644
index 00000000..bfad6147
--- /dev/null
+++ b/src/utilities/timing.hpp
@@ -0,0 +1,123 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides helper functions for time measurement and such.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TIMING_H_
+#define CLBLAST_TIMING_H_
+
+#include <cstdio>
+#include <utility>
+#include <vector>
+#include <algorithm>
+#include <chrono>
+
+#include "utilities/utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Returns the smallest wall-clock time in milliseconds over 'num_runs' timed calls of 'function',
+// which are preceded by a single untimed warm-up call. At least one timed call is always made:
+// with zero runs there would be no timing to take the minimum of.
+template <typename F>
+double TimeFunction(const size_t num_runs, F const &function) {
+  function(); // warm-up
+  auto timings = std::vector<double>(std::max(num_runs, size_t{1}));
+  for (auto &timing: timings) {
+    const auto start_time = std::chrono::steady_clock::now();
+    function();
+    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+    timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+  }
+  return *std::min_element(timings.begin(), timings.end());
+}
+
+// =================================================================================================
+
+using Timing = std::pair<size_t, double>;
+
+// Times 'routine' for every value in the sequence from, from + step, ... that is smaller than
+// 'to'. Returns one (value, milliseconds) pair per value; a run that throws is reported on stdout
+// and recorded with a time of -1.0.
+template <typename T, typename F>
+std::vector<Timing> TimeRoutine(const size_t from, const size_t to, const size_t step,
+                                const size_t num_runs, const Queue& queue,
+                                const std::vector<Buffer<T>>& buffers, F const &routine) {
+  std::vector<Timing> results;
+  auto value = from;
+  while (value < to) {
+    printf("[ RUN ] Running with value %zu\n", value);
+    try {
+      const auto elapsed_ms = TimeFunction(num_runs, [&]() { routine(value, queue, buffers); });
+      printf("[ OK ] Took %.2lf ms\n", elapsed_ms);
+      results.emplace_back(value, elapsed_ms);
+    }
+    catch (...) {
+      printf("[ ERROR ] Exception caught\n");
+      results.emplace_back(value, -1.0); // invalid
+    }
+    value += step;
+  }
+  return results;
+}
+
+// =================================================================================================
+
+// A single named parameter with its value
+using TuningParameter = std::pair<std::string, size_t>;
+// All the parameters that belong to one result
+using TuningParameters = std::vector<TuningParameter>;
+// One entry of the JSON 'results' list as written by PrintTimingsToFileAsJSON: 'name' is stored
+// under the "kernel" key, 'score' under the "time" key and 'parameters' under "parameters"
+struct TuningResult { std::string name; double score; TuningParameters parameters; };
+
+// Writes the tuning results to 'filename' as a JSON object: first the given meta-data pairs, then
+// a fixed set of platform and device properties, and finally a "results" list with one entry per
+// tuning result. If the file cannot be opened for writing an error is printed and nothing is
+// written. Declared inline because this header-defined function may be included in more than one
+// translation unit.
+inline void PrintTimingsToFileAsJSON(const std::string &filename,
+                                     const Device& device, const Platform& platform,
+                                     const std::vector<std::pair<std::string,std::string>> &metadata,
+                                     const std::vector<TuningResult>& tuning_results) {
+  printf("[ STATUS ] Writing results to '%s'\n", filename.c_str());
+  auto file = fopen(filename.c_str(), "w");
+  if (file == nullptr) { // e.g. a read-only or non-existing directory
+    printf("[ ERROR ] Could not open '%s' for writing\n", filename.c_str());
+    return;
+  }
+  fprintf(file, "{\n");
+  for (auto &datum: metadata) {
+    fprintf(file, " \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
+  }
+  fprintf(file, " \"platform_version\": \"%s\",\n", platform.Version().c_str());
+  fprintf(file, " \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
+  fprintf(file, " \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
+  fprintf(file, " \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
+  fprintf(file, " \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
+  fprintf(file, " \"device_core_clock\": \"%zu\",\n", device.CoreClock());
+  fprintf(file, " \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
+  fprintf(file, " \"results\": [\n");
+
+  // Loops over all results
+  const auto num_results = tuning_results.size();
+  for (auto r = size_t{0}; r < num_results; ++r) {
+    const auto &result = tuning_results[r];
+    fprintf(file, " {\n");
+    fprintf(file, " \"kernel\": \"%s\",\n", result.name.c_str());
+    fprintf(file, " \"time\": %.3lf,\n", result.score);
+
+    // Loops over all the parameters for this result
+    fprintf(file, " \"parameters\": {");
+    const auto num_configs = result.parameters.size();
+    for (auto p = size_t{0}; p < num_configs; ++p) {
+      const auto &config = result.parameters[p];
+      fprintf(file, "\"%s\": %zu", config.first.c_str(), config.second);
+      if (p + 1 < num_configs) { fprintf(file, ","); } // no trailing comma after the last one
+    }
+    fprintf(file, "}\n");
+
+    // The footer
+    fprintf(file, " }");
+    if (r + 1 < num_results) { fprintf(file, ","); } // no trailing comma after the last one
+    fprintf(file, "\n");
+  }
+  fprintf(file, " ]\n");
+  fprintf(file, "}\n");
+  fclose(file);
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TIMING_H_
+#endif
diff --git a/test/diagnostics.cpp b/test/diagnostics.cpp
index af56cd30..b7204fe8 100644
--- a/test/diagnostics.cpp
+++ b/test/diagnostics.cpp
@@ -15,24 +15,12 @@
#include <chrono>
#include <algorithm>
+#include "utilities/timing.hpp"
#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
-template<typename F>
-double TimeFunction(const size_t num_runs, F const &function) {
- auto timings = std::vector<double>(num_runs);
- for (auto &timing: timings) {
- const auto start_time = std::chrono::steady_clock::now();
- function();
- const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
- timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
- }
- return *std::min_element(timings.begin(), timings.end());
-
-}
-
void OpenCLDiagnostics(int argc, char *argv[]) {
auto arguments = RetrieveCommandLineArguments(argc, argv);
diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp
index 8444c1c3..fe8cf7b9 100644
--- a/test/routines/level3/xgemm.hpp
+++ b/test/routines/level3/xgemm.hpp
@@ -86,7 +86,7 @@ class TestXgemm {
if (V != 0) {
const auto device = queue.GetDevice();
const auto switch_threshold = (V == 1) ? size_t{0} : size_t{1024 * 1024 * 1024}; // large enough for tests
- const auto override_status = OverrideParameters(device(), "KernelSelection", PrecisionValue<T>(),
+ const auto override_status = OverrideParameters(device(), "GemmRoutine", PrecisionValue<T>(),
{{"XGEMM_MIN_INDIRECT_SIZE", switch_threshold}});
if (override_status != StatusCode::kSuccess) { return override_status; }
}