summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG10
-rw-r--r--CMakeLists.txt8
-rw-r--r--LICENSE27
-rw-r--r--README.md9
-rw-r--r--doc/performance/GeForce_GTX480/SAXPY.pdfbin13225 -> 0 bytes
-rw-r--r--doc/performance/GeForce_GTX480/SGEMM.pdfbin13198 -> 0 bytes
-rw-r--r--doc/performance/GeForce_GTX480/SGEMV.pdfbin13739 -> 0 bytes
-rw-r--r--doc/performance/GeForce_GTX480/SSYMM.pdfbin13132 -> 0 bytes
-rw-r--r--doc/performance/GeForce_GTX750Ti/SAXPY.pdfbin0 -> 13273 bytes
-rw-r--r--doc/performance/GeForce_GTX750Ti/SGEMM.pdfbin0 -> 13153 bytes
-rw-r--r--doc/performance/GeForce_GTX750Ti/SGEMV.pdfbin0 -> 13678 bytes
-rw-r--r--doc/performance/GeForce_GTX750Ti/SSYMM.pdfbin0 -> 13092 bytes
-rw-r--r--doc/performance/Intel_IrisPro/SAXPY.pdfbin13361 -> 13267 bytes
-rw-r--r--doc/performance/Intel_IrisPro/SGEMM.pdfbin13099 -> 13037 bytes
-rw-r--r--doc/performance/Intel_IrisPro/SGEMV.pdfbin13626 -> 13592 bytes
-rw-r--r--doc/performance/Radeon_M370X/SGEMM.pdfbin13326 -> 13246 bytes
-rw-r--r--doc/performance/Radeon_M370X/SGEMV.pdfbin13701 -> 13667 bytes
-rw-r--r--doc/performance/Radeon_M370X/SSYMM.pdfbin13228 -> 13217 bytes
-rw-r--r--include/clblast_half.h5
-rw-r--r--samples/cache.c6
-rw-r--r--samples/dgemv.c6
-rw-r--r--samples/haxpy.c6
-rw-r--r--samples/sasum.c6
-rw-r--r--samples/sgemm.c6
-rw-r--r--samples/sgemm.cpp6
-rwxr-xr-xscripts/database/database.py2
-rw-r--r--scripts/graphs/common.r35
-rw-r--r--scripts/graphs/xgemm_small.r56
-rw-r--r--scripts/graphs/xsymm.r46
-rw-r--r--scripts/graphs/xsyrk.r46
-rw-r--r--src/clpp11.hpp48
-rw-r--r--src/database/database.cpp23
-rw-r--r--src/database/database.hpp35
-rw-r--r--src/database/kernel_selection.hpp129
-rw-r--r--src/database/kernels/copy.hpp12
-rw-r--r--src/database/kernels/pad.hpp12
-rw-r--r--src/database/kernels/padtranspose.hpp4
-rw-r--r--src/database/kernels/transpose.hpp6
-rw-r--r--src/database/kernels/xaxpy.hpp8
-rw-r--r--src/database/kernels/xdot.hpp6
-rw-r--r--src/database/kernels/xgemm.hpp12
-rw-r--r--src/database/kernels/xgemm_direct.hpp136
-rw-r--r--src/database/kernels/xgemv.hpp8
-rw-r--r--src/database/kernels/xgemv_fast.hpp9
-rw-r--r--src/database/kernels/xgemv_fast_rot.hpp8
-rw-r--r--src/database/kernels/xger.hpp10
-rw-r--r--src/kernels/common.opencl2
-rw-r--r--src/kernels/level3/xgemm_direct_part1.opencl273
-rw-r--r--src/kernels/level3/xgemm_direct_part2.opencl314
-rw-r--r--src/kernels/level3/xgemm_direct_part3.opencl214
-rw-r--r--src/msvc.hpp39
-rw-r--r--src/routine.cpp10
-rw-r--r--src/routines/level3/xgemm.cpp108
-rw-r--r--src/routines/level3/xgemm.hpp23
-rw-r--r--src/tuning/kernels/copy_fast.cpp1
-rw-r--r--src/tuning/kernels/copy_pad.cpp1
-rw-r--r--src/tuning/kernels/transpose_fast.cpp1
-rw-r--r--src/tuning/kernels/transpose_pad.cpp1
-rw-r--r--src/tuning/kernels/xaxpy.cpp1
-rw-r--r--src/tuning/kernels/xdot.cpp1
-rw-r--r--src/tuning/kernels/xgemm.cpp7
-rw-r--r--src/tuning/kernels/xgemm_direct.cpp196
-rw-r--r--src/tuning/kernels/xgemv.cpp1
-rw-r--r--src/tuning/kernels/xger.cpp1
-rw-r--r--src/tuning/tuning.hpp18
-rw-r--r--src/utilities.cpp20
-rw-r--r--src/utilities.hpp6
-rw-r--r--test/correctness/testblas.cpp14
-rw-r--r--test/correctness/testblas.hpp1
-rw-r--r--test/correctness/tester.cpp26
-rw-r--r--test/performance/client.cpp14
-rw-r--r--test/performance/client.hpp1
-rw-r--r--test/routines/level1/xamax.hpp2
-rw-r--r--test/routines/level1/xasum.hpp2
-rw-r--r--test/routines/level1/xaxpy.hpp2
-rw-r--r--test/routines/level1/xcopy.hpp2
-rw-r--r--test/routines/level1/xdot.hpp2
-rw-r--r--test/routines/level1/xdotc.hpp2
-rw-r--r--test/routines/level1/xdotu.hpp2
-rw-r--r--test/routines/level1/xnrm2.hpp2
-rw-r--r--test/routines/level1/xscal.hpp2
-rw-r--r--test/routines/level1/xswap.hpp2
-rw-r--r--test/routines/level2/xgbmv.hpp2
-rw-r--r--test/routines/level2/xgemv.hpp2
-rw-r--r--test/routines/level2/xger.hpp2
-rw-r--r--test/routines/level2/xgerc.hpp2
-rw-r--r--test/routines/level2/xgeru.hpp2
-rw-r--r--test/routines/level2/xhbmv.hpp2
-rw-r--r--test/routines/level2/xhemv.hpp2
-rw-r--r--test/routines/level2/xher.hpp2
-rw-r--r--test/routines/level2/xher2.hpp2
-rw-r--r--test/routines/level2/xhpmv.hpp2
-rw-r--r--test/routines/level2/xhpr.hpp2
-rw-r--r--test/routines/level2/xhpr2.hpp2
-rw-r--r--test/routines/level2/xsbmv.hpp2
-rw-r--r--test/routines/level2/xspmv.hpp2
-rw-r--r--test/routines/level2/xspr.hpp2
-rw-r--r--test/routines/level2/xspr2.hpp2
-rw-r--r--test/routines/level2/xsymv.hpp2
-rw-r--r--test/routines/level2/xsyr.hpp2
-rw-r--r--test/routines/level2/xsyr2.hpp2
-rw-r--r--test/routines/level2/xtbmv.hpp2
-rw-r--r--test/routines/level2/xtpmv.hpp2
-rw-r--r--test/routines/level2/xtrmv.hpp2
-rw-r--r--test/routines/level3/xgemm.hpp2
-rw-r--r--test/routines/level3/xhemm.hpp2
-rw-r--r--test/routines/level3/xher2k.hpp2
-rw-r--r--test/routines/level3/xherk.hpp2
-rw-r--r--test/routines/level3/xsymm.hpp2
-rw-r--r--test/routines/level3/xsyr2k.hpp2
-rw-r--r--test/routines/level3/xsyrk.hpp2
-rw-r--r--test/routines/level3/xtrmm.hpp2
-rw-r--r--test/routines/levelx/xomatcopy.hpp2
113 files changed, 1878 insertions, 244 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 1995dc84..0d517869 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,14 @@
+Development version (next release)
+- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
+- Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation
+- Fixed a bug in the tests and samples related to waiting for an invalid event
+- Added support for compilation under Visual Studio 2013 (MSVC++ 12.0)
+- Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS
+- Added an option to run tuned kernels multiple times to average execution times
+- Various minor fixes and enhancements
+- Added tuned parameters for various devices (see README)
+
Version 0.9.0
- Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header
- Improved performance significantly of rotated GEMV computations
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b422743f..9b028448 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -135,7 +135,8 @@ endif()
# ==================================================================================================
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
-set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemv)
+set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
+ xgemm xgemm_direct xgemv)
set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
@@ -278,8 +279,9 @@ if(CLIENTS OR TESTS)
set(REF_INCLUDES )
set(REF_LIBRARIES )
if(CLBLAS_FOUND)
+ find_package(Threads)
+ set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
- set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES})
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
add_definitions(" /DCLBLAST_REF_CLBLAS")
else()
@@ -362,7 +364,7 @@ if(TESTS)
test/correctness/tester.cpp test/correctness/testblas.cpp)
target_include_directories(test_correctness_common PUBLIC
$<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
- ${clblast_SOURCE_DIR})
+ ${clblast_SOURCE_DIR} ${REF_INCLUDES})
set(TESTS_COMMON ${TESTS_COMMON} $<TARGET_OBJECTS:test_correctness_common>)
endif()
diff --git a/LICENSE b/LICENSE
index ae43189f..75f63024 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,14 +1,21 @@
+MIT License
-Copyright (c) 2015 Cedric Nugteren
+Copyright (c) 2016 Cedric Nugteren
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
- http://www.apache.org/licenses/LICENSE-2.0
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 7f6a3d96..33282d8f 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ The pre-requisites for compilation of CLBlast are:
- Clang 3.3 or newer
- AppleClang 5.0 or newer
- ICC 14.0 or newer
- - MSVC (Visual Studio) 2015 or newer
+ - MSVC (Visual Studio) 2013 or newer
* An OpenCL 1.1 or newer library, for example:
- Apple OpenCL
- NVIDIA CUDA SDK
@@ -90,6 +90,8 @@ Afterwards, any of CLBlast's routines can be called directly: there is no need t
cmake -DSAMPLES=ON ..
+Furthermore, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler.
+
Using the tuners (optional)
-------------
@@ -117,8 +119,9 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- Tahiti
* Intel GPUs:
- HD Graphics 530
- - HD Graphics Haswell Ultrabook GT2 Mobile
- HD Graphics 5500 BroadWell U-Processor GT2
+ - HD Graphics Haswell Ultrabook GT2 Mobile
+ - HD Graphics IvyBridge M GT2
- HD Graphics Skylake ULT GT2
- Iris
- Iris Pro
@@ -134,7 +137,7 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
cmake -DTUNERS=ON ..
-Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.3.1 or higher).
+Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.5.0 or higher).
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.
diff --git a/doc/performance/GeForce_GTX480/SAXPY.pdf b/doc/performance/GeForce_GTX480/SAXPY.pdf
deleted file mode 100644
index 6e1c8f5a..00000000
--- a/doc/performance/GeForce_GTX480/SAXPY.pdf
+++ /dev/null
Binary files differ
diff --git a/doc/performance/GeForce_GTX480/SGEMM.pdf b/doc/performance/GeForce_GTX480/SGEMM.pdf
deleted file mode 100644
index f430f880..00000000
--- a/doc/performance/GeForce_GTX480/SGEMM.pdf
+++ /dev/null
Binary files differ
diff --git a/doc/performance/GeForce_GTX480/SGEMV.pdf b/doc/performance/GeForce_GTX480/SGEMV.pdf
deleted file mode 100644
index 8cb57124..00000000
--- a/doc/performance/GeForce_GTX480/SGEMV.pdf
+++ /dev/null
Binary files differ
diff --git a/doc/performance/GeForce_GTX480/SSYMM.pdf b/doc/performance/GeForce_GTX480/SSYMM.pdf
deleted file mode 100644
index ff5941ad..00000000
--- a/doc/performance/GeForce_GTX480/SSYMM.pdf
+++ /dev/null
Binary files differ
diff --git a/doc/performance/GeForce_GTX750Ti/SAXPY.pdf b/doc/performance/GeForce_GTX750Ti/SAXPY.pdf
new file mode 100644
index 00000000..531baa79
--- /dev/null
+++ b/doc/performance/GeForce_GTX750Ti/SAXPY.pdf
Binary files differ
diff --git a/doc/performance/GeForce_GTX750Ti/SGEMM.pdf b/doc/performance/GeForce_GTX750Ti/SGEMM.pdf
new file mode 100644
index 00000000..dcd62929
--- /dev/null
+++ b/doc/performance/GeForce_GTX750Ti/SGEMM.pdf
Binary files differ
diff --git a/doc/performance/GeForce_GTX750Ti/SGEMV.pdf b/doc/performance/GeForce_GTX750Ti/SGEMV.pdf
new file mode 100644
index 00000000..a4c3efb3
--- /dev/null
+++ b/doc/performance/GeForce_GTX750Ti/SGEMV.pdf
Binary files differ
diff --git a/doc/performance/GeForce_GTX750Ti/SSYMM.pdf b/doc/performance/GeForce_GTX750Ti/SSYMM.pdf
new file mode 100644
index 00000000..43d97d24
--- /dev/null
+++ b/doc/performance/GeForce_GTX750Ti/SSYMM.pdf
Binary files differ
diff --git a/doc/performance/Intel_IrisPro/SAXPY.pdf b/doc/performance/Intel_IrisPro/SAXPY.pdf
index 3a51f306..8d639b24 100644
--- a/doc/performance/Intel_IrisPro/SAXPY.pdf
+++ b/doc/performance/Intel_IrisPro/SAXPY.pdf
Binary files differ
diff --git a/doc/performance/Intel_IrisPro/SGEMM.pdf b/doc/performance/Intel_IrisPro/SGEMM.pdf
index 15f1714f..31725025 100644
--- a/doc/performance/Intel_IrisPro/SGEMM.pdf
+++ b/doc/performance/Intel_IrisPro/SGEMM.pdf
Binary files differ
diff --git a/doc/performance/Intel_IrisPro/SGEMV.pdf b/doc/performance/Intel_IrisPro/SGEMV.pdf
index e1660999..9ec120c4 100644
--- a/doc/performance/Intel_IrisPro/SGEMV.pdf
+++ b/doc/performance/Intel_IrisPro/SGEMV.pdf
Binary files differ
diff --git a/doc/performance/Radeon_M370X/SGEMM.pdf b/doc/performance/Radeon_M370X/SGEMM.pdf
index 5dca8f03..da5722f9 100644
--- a/doc/performance/Radeon_M370X/SGEMM.pdf
+++ b/doc/performance/Radeon_M370X/SGEMM.pdf
Binary files differ
diff --git a/doc/performance/Radeon_M370X/SGEMV.pdf b/doc/performance/Radeon_M370X/SGEMV.pdf
index fa661249..513318bf 100644
--- a/doc/performance/Radeon_M370X/SGEMV.pdf
+++ b/doc/performance/Radeon_M370X/SGEMV.pdf
Binary files differ
diff --git a/doc/performance/Radeon_M370X/SSYMM.pdf b/doc/performance/Radeon_M370X/SSYMM.pdf
index 852181d1..03efd198 100644
--- a/doc/performance/Radeon_M370X/SSYMM.pdf
+++ b/doc/performance/Radeon_M370X/SSYMM.pdf
Binary files differ
diff --git a/include/clblast_half.h b/include/clblast_half.h
index 269a520e..05d96f9f 100644
--- a/include/clblast_half.h
+++ b/include/clblast_half.h
@@ -25,6 +25,11 @@
#include <CL/opencl.h>
#endif
+// MSVC 2013 doesn't fully support C99
+#ifdef _MSC_VER
+ #define inline __inline
+#endif
+
// =================================================================================================
// Host data-type for half-precision floating-point (16-bit). This is based on the OpenCL type,
diff --git a/samples/cache.c b/samples/cache.c
index a592824d..abc8ad4b 100644
--- a/samples/cache.c
+++ b/samples/cache.c
@@ -112,8 +112,10 @@ void run_example_routine(const cl_device_id device) {
&queue, &event);
// Wait for completion
- clWaitForEvents(1, &event);
- clReleaseEvent(event);
+ if (status == kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
// Retrieves the execution time
clock_t diff = clock() - start;
diff --git a/samples/dgemv.c b/samples/dgemv.c
index c22c9f37..a15d649a 100644
--- a/samples/dgemv.c
+++ b/samples/dgemv.c
@@ -84,8 +84,10 @@ int main(void) {
&queue, &event);
// Wait for completion
- clWaitForEvents(1, &event);
- clReleaseEvent(event);
+ if (status == kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed DGEMV with status %d\n", status);
diff --git a/samples/haxpy.c b/samples/haxpy.c
index d5b98e12..5bab3d42 100644
--- a/samples/haxpy.c
+++ b/samples/haxpy.c
@@ -77,8 +77,10 @@ int main(void) {
&queue, &event);
// Wait for completion
- clWaitForEvents(1, &event);
- clReleaseEvent(event);
+ if (status == kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
// Copies the result back to the host
clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);
diff --git a/samples/sasum.c b/samples/sasum.c
index 1518cc13..02f924b0 100644
--- a/samples/sasum.c
+++ b/samples/sasum.c
@@ -73,8 +73,10 @@ int main(void) {
&queue, &event);
// Wait for completion
- clWaitForEvents(1, &event);
- clReleaseEvent(event);
+ if (status == kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
// Copies the result back to the host
clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);
diff --git a/samples/sgemm.c b/samples/sgemm.c
index b4827777..583fc261 100644
--- a/samples/sgemm.c
+++ b/samples/sgemm.c
@@ -87,8 +87,10 @@ int main(void) {
&queue, &event);
// Wait for completion
- clWaitForEvents(1, &event);
- clReleaseEvent(event);
+ if (status == kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed SGEMM with status %d\n", status);
diff --git a/samples/sgemm.cpp b/samples/sgemm.cpp
index a4b89968..401ecff8 100644
--- a/samples/sgemm.cpp
+++ b/samples/sgemm.cpp
@@ -95,8 +95,10 @@ int main() {
&queue_plain, &event);
// Record the execution time
- clWaitForEvents(1, &event);
- clReleaseEvent(event);
+ if (status == clblast::StatusCode::kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
diff --git a/scripts/database/database.py b/scripts/database/database.py
index f758a2b7..31f313da 100755
--- a/scripts/database/database.py
+++ b/scripts/database/database.py
@@ -18,7 +18,7 @@ import database.bests as bests
import database.defaults as defaults
# Server storing a copy of the database
-DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.json"
+DATABASE_SERVER_URL = "https://raw.githubusercontent.com/CNugteren/CLBlast-database/master/database.json"
# OpenCL vendor names and their short name
VENDOR_TRANSLATION_TABLE = {
diff --git a/scripts/graphs/common.r b/scripts/graphs/common.r
index cd68cf26..e5dad616 100644
--- a/scripts/graphs/common.r
+++ b/scripts/graphs/common.r
@@ -31,8 +31,12 @@ options("width"=170)
# ==================================================================================================
-# Constants
-num_runs <- 4
+# Settings
+num_runs <- 5
+num_runs_short <- 50
+xtics_subset_threshold <- 100
+xtics_subset_stepsize <- 8
+
devices <- c("-platform","-device")
options_string <- "-q -no_abbrv -cblas 0"
library_names <- c("CLBlast", "clBLAS")
@@ -66,11 +70,21 @@ main <- function(routine_name, precision, test_names, test_values,
executable <- paste("./clblast_client_", routine_name, sep="")
# Configures the outputfile
- pdf(paste(display_name, ".pdf", sep=""), height=8, width=13)
- par(mfrow=c(2, 3))
- par(oma=c(0, 0, 0, 0))
- par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
- par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
+ file_name <- paste(display_name, ".pdf", sep="")
+ if (length(test_names) == 6) {
+ pdf(file_name, height=8, width=13)
+ par(mfrow=c(2, 3))
+ par(oma=c(0, 0, 0, 0))
+ par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
+ par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
+ }
+ else { # length(test_names) == 2
+ pdf(file_name, height=8, width=13)
+ par(mfrow=c(2, 1))
+ par(oma=c(0, 0, 0, 0))
+ par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
+ par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
+ }
# Loops over the test-cases
for (test_id in 1:length(test_names)) {
@@ -169,7 +183,12 @@ plot_graph <- function(xdata, ydata, log_setting,
main="", xlab="", ylab="",
ylim=c(ymin, ymax), xlim=c(xmin, xmax), axes=F, "n")
axis(side=2, las=2)
- axis(side=1, at=xdata, labels=xtics, las=2)
+ if (length(xdata) > xtics_subset_threshold) { # Too many indices to print, plot only every Nth
+ subset <- seq(from=1, to=length(xdata), by=xtics_subset_stepsize)
+ axis(side=1, at=xdata[subset], labels=xtics[subset], las=2)
+ } else {
+ axis(side=1, at=xdata, labels=xtics, las=2)
+ }
title(xlab=xlabel, line=-1)
title(ylab=ylabel, line=2)
title(graph_title, line=-2)
diff --git a/scripts/graphs/xgemm_small.r b/scripts/graphs/xgemm_small.r
new file mode 100644
index 00000000..ef94ef20
--- /dev/null
+++ b/scripts/graphs/xgemm_small.r
@@ -0,0 +1,56 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project uses a tab-size of two spaces and a max-width of 100 characters per line.
+#
+# Author(s):
+# Cedric Nugteren <www.cedricnugteren.nl>
+#
+# This file implements the performance script for small sizes of Xgemm, testing the direct kernel
+#
+# ==================================================================================================
+
+# Includes the common functions
+args <- commandArgs(trailingOnly = FALSE)
+thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
+source(file.path(dirname(thisfile), "common.r"))
+
+# ==================================================================================================
+
+# Settings
+routine_name <- "xgemm"
+parameters <- c("-m","-n","-k","-layout","-transA","-transB",
+ "-num_steps","-step","-runs","-precision")
+precision <- 32
+
+# Sets the names of the test-cases
+test_names <- list(
+ "small matrices in steps of 16",
+ "small matrices in steps of 1"
+)
+
+# Defines the test-cases
+test_values <- list(
+ list(c( 128, 128, 128, 102, 111, 111, 57, 16, num_runs_short, precision)),
+ list(c( 128, 128, 128, 102, 111, 111, 385, 1, num_runs_short, precision))
+)
+
+# Defines the x-labels corresponding to the test-cases
+test_xlabels <- list(
+ "matrix sizes (m=n=k)",
+ "matrix sizes (m=n=k)"
+)
+
+# Defines the x-axis of the test-cases
+test_xaxis <- list(
+ c("m", ""),
+ c("m", "")
+)
+
+# ==================================================================================================
+
+# Start the script
+main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
+ test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
+
+# ================================================================================================== \ No newline at end of file
diff --git a/scripts/graphs/xsymm.r b/scripts/graphs/xsymm.r
index a65bb16f..89d137d2 100644
--- a/scripts/graphs/xsymm.r
+++ b/scripts/graphs/xsymm.r
@@ -35,32 +35,32 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
- list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
- list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
- list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
- list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
+ list(c( 128, 128, 102, 141, 121, 16, 128, num_runs, precision)),
+ list(c( 129, 129, 102, 141, 121, 16, 128, num_runs, precision)),
+ list(c( 512, 512, 102, 141, 121, 16, 1, num_runs, precision)),
+ list(c(2048, 2048, 102, 141, 121, 16, 1, num_runs, precision)),
list(
- c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
- c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
- c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
- c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
- c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
- c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
- c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
- c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
+ c(1024, 1024, 101, 141, 121, 1, 0, num_runs, precision),
+ c(1024, 1024, 101, 141, 122, 1, 0, num_runs, precision),
+ c(1024, 1024, 101, 142, 121, 1, 0, num_runs, precision),
+ c(1024, 1024, 101, 142, 122, 1, 0, num_runs, precision),
+ c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
+ c(1024, 1024, 102, 141, 122, 1, 0, num_runs, precision),
+ c(1024, 1024, 102, 142, 121, 1, 0, num_runs, precision),
+ c(1024, 1024, 102, 142, 122, 1, 0, num_runs, precision)
),
list(
- c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
- c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
- c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
- c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
- c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
- c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
- c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
- c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
- c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
- c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
- c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
+ c( 8, 8, 102, 141, 121, 1, 0, num_runs, precision),
+ c( 16, 16, 102, 141, 121, 1, 0, num_runs, precision),
+ c( 32, 32, 102, 141, 121, 1, 0, num_runs, precision),
+ c( 64, 64, 102, 141, 121, 1, 0, num_runs, precision),
+ c( 128, 128, 102, 141, 121, 1, 0, num_runs, precision),
+ c( 256, 256, 102, 141, 121, 1, 0, num_runs, precision),
+ c( 512, 512, 102, 141, 121, 1, 0, num_runs, precision),
+ c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
+ c(2048, 2048, 102, 141, 121, 1, 0, num_runs, precision),
+ c(4096, 4096, 102, 141, 121, 1, 0, num_runs, precision),
+ c(8192, 8192, 102, 141, 121, 1, 0, num_runs, precision)
)
)
diff --git a/scripts/graphs/xsyrk.r b/scripts/graphs/xsyrk.r
index 4ab46c9f..754c93e2 100644
--- a/scripts/graphs/xsyrk.r
+++ b/scripts/graphs/xsyrk.r
@@ -35,32 +35,32 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
- list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
- list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
- list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
- list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
+ list(c( 128, 128, 102, 121, 111, 16, 128, num_runs, precision)),
+ list(c( 129, 129, 102, 121, 111, 16, 128, num_runs, precision)),
+ list(c( 512, 512, 102, 121, 111, 16, 1, num_runs, precision)),
+ list(c(2048, 2048, 102, 121, 111, 16, 1, num_runs, precision)),
list(
- c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
- c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
- c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
- c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
- c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
- c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
- c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
- c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
+ c(1024, 1024, 101, 121, 111, 1, 0, num_runs, precision),
+ c(1024, 1024, 101, 121, 112, 1, 0, num_runs, precision),
+ c(1024, 1024, 101, 122, 111, 1, 0, num_runs, precision),
+ c(1024, 1024, 101, 122, 112, 1, 0, num_runs, precision),
+ c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
+ c(1024, 1024, 102, 121, 112, 1, 0, num_runs, precision),
+ c(1024, 1024, 102, 122, 111, 1, 0, num_runs, precision),
+ c(1024, 1024, 102, 122, 112, 1, 0, num_runs, precision)
),
list(
- c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
- c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
- c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
- c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
- c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
- c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
- c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
- c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
- c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
- c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
- c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
+ c( 8, 8, 102, 121, 111, 1, 0, num_runs, precision),
+ c( 16, 16, 102, 121, 111, 1, 0, num_runs, precision),
+ c( 32, 32, 102, 121, 111, 1, 0, num_runs, precision),
+ c( 64, 64, 102, 121, 111, 1, 0, num_runs, precision),
+ c( 128, 128, 102, 121, 111, 1, 0, num_runs, precision),
+ c( 256, 256, 102, 121, 111, 1, 0, num_runs, precision),
+ c( 512, 512, 102, 121, 111, 1, 0, num_runs, precision),
+ c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
+ c(2048, 2048, 102, 121, 111, 1, 0, num_runs, precision),
+ c(4096, 4096, 102, 121, 111, 1, 0, num_runs, precision),
+ c(8192, 8192, 102, 121, 111, 1, 0, num_runs, precision)
)
)
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index d57223dd..aaa76cb4 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -12,8 +12,8 @@
// Portability here means that a similar header exists for CUDA with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
-// This file is taken from the Claduc project <https://github.com/CNugteren/Claduc> and therefore
-// contains the following header copyright notice:
+// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
+// therefore contains the following header copyright notice:
//
// =================================================================================================
//
@@ -97,14 +97,12 @@ class Event {
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
float GetElapsedTime() const {
WaitForCompletion();
- auto bytes = size_t{0};
- clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
- auto time_start = size_t{0};
+ const auto bytes = sizeof(cl_ulong);
+ auto time_start = cl_ulong{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
- clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
- auto time_end = size_t{0};
+ auto time_end = cl_ulong{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
- return (time_end - time_start) * 1.0e-6f;
+ return static_cast<float>(time_end - time_start) * 1.0e-6f;
}
// Accessor to the private data-member
@@ -152,6 +150,17 @@ class Platform {
cl_platform_id platform_;
};
+// Retrieves a vector with all platforms
+inline std::vector<Platform> GetAllPlatforms() {
+ auto num_platforms = cl_uint{0};
+ CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
+ auto all_platforms = std::vector<Platform>();
+ for (size_t platform_id = 0; platform_id < static_cast<size_t>(num_platforms); ++platform_id) {
+ all_platforms.push_back(Platform(platform_id));
+ }
+ return all_platforms;
+}
+
// =================================================================================================
// C++11 version of 'cl_device_id'
@@ -201,8 +210,8 @@ class Device {
std::vector<size_t> MaxWorkItemSizes() const {
return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
}
- cl_ulong LocalMemSize() const {
- return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
+ unsigned long LocalMemSize() const {
+ return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
}
std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
size_t CoreClock() const {
@@ -238,9 +247,11 @@ class Device {
// Query for a specific type of device or brand
bool IsCPU() const { return Type() == "CPU"; }
bool IsGPU() const { return Type() == "GPU"; }
- bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
+ bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." ||
+ Vendor() == "AuthenticAMD";; }
bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
- bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
+ bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" ||
+ Vendor() == "GenuineIntel"; }
bool IsARM() const { return Vendor() == "ARM"; }
// Accessor to the private data-member
@@ -606,8 +617,7 @@ class Buffer {
// Retrieves the actual allocated size in bytes
size_t GetSize() const {
- auto bytes = size_t{0};
- CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes));
+ const auto bytes = sizeof(size_t);
auto result = size_t{0};
CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
return result;
@@ -658,17 +668,16 @@ class Kernel {
}
// Retrieves the amount of local memory used per work-group for this kernel
- cl_ulong LocalMemUsage(const Device &device) const {
- auto bytes = size_t{0};
+ unsigned long LocalMemUsage(const Device &device) const {
+ const auto bytes = sizeof(cl_ulong);
auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
- CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes));
auto result = cl_ulong{0};
CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
- return result;
+ return static_cast<unsigned long>(result);
}
// Retrieves the name of the kernel
- std::string GetFunctionName() {
+ std::string GetFunctionName() const {
auto bytes = size_t{0};
CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
auto result = std::string{};
@@ -689,6 +698,7 @@ class Kernel {
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event,
const std::vector<Event> &waitForEvents) {
+
// Builds a plain version of the events waiting list
auto waitForEventsPlain = std::vector<cl_event>();
for (auto &waitEvent : waitForEvents) {
diff --git a/src/database/database.cpp b/src/database/database.cpp
index 34c44a29..1198cefb 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -21,10 +21,12 @@
#include "database/kernels/xgemv_fast_rot.hpp"
#include "database/kernels/xger.hpp"
#include "database/kernels/xgemm.hpp"
+#include "database/kernels/xgemm_direct.hpp"
#include "database/kernels/copy.hpp"
#include "database/kernels/pad.hpp"
#include "database/kernels/transpose.hpp"
#include "database/kernels/padtranspose.hpp"
+#include "database/kernel_selection.hpp"
namespace clblast {
// =================================================================================================
@@ -38,10 +40,29 @@ const std::vector<Database::DatabaseEntry> Database::database = {
XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble,
XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
+ XgemmDirectHalf, XgemmDirectSingle, XgemmDirectDouble, XgemmDirectComplexSingle, XgemmDirectComplexDouble,
CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
- PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
+ PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble,
+ KernelSelectionHalf, KernelSelectionSingle, KernelSelectionDouble, KernelSelectionComplexSingle, KernelSelectionComplexDouble
+};
+
+// The OpenCL device types
+const std::string Database::kDeviceTypeCPU = "CPU";
+const std::string Database::kDeviceTypeGPU = "GPU";
+const std::string Database::kDeviceTypeAccelerator = "accelerator";
+const std::string Database::kDeviceTypeAll = "default";
+
+// The OpenCL device vendors
+const std::string Database::kDeviceVendorAll = "default";
+
+// Alternative names for some OpenCL vendors
+const std::unordered_map<std::string, std::string> Database::kVendorNames{
+ { "Intel(R) Corporation", "Intel" },
+ { "GenuineIntel", "Intel" },
+ { "Advanced Micro Devices, Inc.", "AMD" },
+ { "NVIDIA Corporation", "NVIDIA" },
};
// =================================================================================================
diff --git a/src/database/database.hpp b/src/database/database.hpp
index a6ab49c5..346fe089 100644
--- a/src/database/database.hpp
+++ b/src/database/database.hpp
@@ -36,36 +36,31 @@ class Database {
// Structures for content inside the database
struct DatabaseDevice {
- const std::string name;
- const Parameters parameters;
+ std::string name;
+ Parameters parameters;
};
struct DatabaseVendor {
- const std::string type;
- const std::string name;
- const std::vector<DatabaseDevice> devices;
+ std::string type;
+ std::string name;
+ std::vector<DatabaseDevice> devices;
};
struct DatabaseEntry {
- const std::string kernel;
- const Precision precision;
- const std::vector<DatabaseVendor> vendors;
+ std::string kernel;
+ Precision precision;
+ std::vector<DatabaseVendor> vendors;
};
// The OpenCL device types
- static constexpr auto kDeviceTypeCPU = "CPU";
- static constexpr auto kDeviceTypeGPU = "GPU";
- static constexpr auto kDeviceTypeAccelerator = "accelerator";
- static constexpr auto kDeviceTypeAll = "default";
+ static const std::string kDeviceTypeCPU;
+ static const std::string kDeviceTypeGPU;
+ static const std::string kDeviceTypeAccelerator;
+ static const std::string kDeviceTypeAll;
// The OpenCL device vendors
- static constexpr auto kDeviceVendorAll = "default";
+ static const std::string kDeviceVendorAll;
// Alternative names for some OpenCL vendors
- const std::unordered_map<std::string,std::string> kVendorNames {
- {"Intel(R) Corporation", "Intel"},
- {"GenuineIntel", "Intel"},
- {"Advanced Micro Devices, Inc.", "AMD"},
- {"NVIDIA Corporation", "NVIDIA"},
- };
+ static const std::unordered_map<std::string, std::string> kVendorNames;
// The database consists of separate database entries, stored together in a vector
static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
@@ -75,10 +70,12 @@ class Database {
static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble;
static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
+ static const DatabaseEntry XgemmDirectHalf, XgemmDirectSingle, XgemmDirectDouble, XgemmDirectComplexSingle, XgemmDirectComplexDouble;
static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
+ static const DatabaseEntry KernelSelectionHalf, KernelSelectionSingle, KernelSelectionDouble, KernelSelectionComplexSingle, KernelSelectionComplexDouble;
static const std::vector<DatabaseEntry> database;
// The constructor with a user-provided database overlay (potentially an empty vector)
diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp
new file mode 100644
index 00000000..c9462c7a
--- /dev/null
+++ b/src/database/kernel_selection.hpp
@@ -0,0 +1,129 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel
+// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the
+// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use
+// some common default values.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::KernelSelectionHalf = {
+ "KernelSelection", Precision::kHalf, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::KernelSelectionSingle = {
+ "KernelSelection", Precision::kSingle, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::KernelSelectionComplexSingle = {
+ "KernelSelection", Precision::kComplexSingle, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::KernelSelectionDouble = {
+ "KernelSelection", Precision::kDouble, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::KernelSelectionComplexDouble = {
+ "KernelSelection", Precision::kComplexDouble, {
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp
index a6b7dfe8..479c7f78 100644
--- a/src/database/kernels/copy.hpp
+++ b/src/database/kernels/copy.hpp
@@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::CopySingle = {
{ "Intel(R) HD Graphics 530", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
@@ -84,7 +85,7 @@ const Database::DatabaseEntry Database::CopySingle = {
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
- { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
@@ -128,6 +129,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
@@ -147,7 +149,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
- { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
@@ -205,13 +207,13 @@ const Database::DatabaseEntry Database::CopyDouble = {
{ "GeForce GTX 670", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
- { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+ { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Default
@@ -264,7 +266,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp
index 3cfabaf4..48085139 100644
--- a/src/database/kernels/pad.hpp
+++ b/src/database/kernels/pad.hpp
@@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::PadSingle = {
{ "Intel(R) HD Graphics 530", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
@@ -84,7 +85,7 @@ const Database::DatabaseEntry Database::PadSingle = {
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
- { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@@ -134,10 +135,11 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // Intel accelerators
@@ -154,13 +156,13 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
- { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Default
@@ -272,7 +274,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
- { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp
index 88bd4ea7..f9448985 100644
--- a/src/database/kernels/padtranspose.hpp
+++ b/src/database/kernels/padtranspose.hpp
@@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
{ "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -134,6 +135,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -272,7 +274,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+ { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp
index 0e1b608e..191d2e98 100644
--- a/src/database/kernels/transpose.hpp
+++ b/src/database/kernels/transpose.hpp
@@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::TransposeSingle = {
{ "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
@@ -134,6 +135,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
@@ -159,7 +161,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
- { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
}
@@ -207,7 +209,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
- { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp
index 9c1bcd99..70be5abc 100644
--- a/src/database/kernels/xaxpy.hpp
+++ b/src/database/kernels/xaxpy.hpp
@@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ "Intel(R) HD Graphics 530", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } },
{ "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
@@ -84,7 +85,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
- { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@@ -134,6 +135,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"VW",4}, {"WGS",64}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
{ "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
@@ -213,7 +215,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
{ "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
- { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+ { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
@@ -272,7 +274,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
{ "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
- { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
+ { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp
index 987a990d..96a699aa 100644
--- a/src/database/kernels/xdot.hpp
+++ b/src/database/kernels/xdot.hpp
@@ -55,6 +55,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WGS2",32} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",512}, {"WGS2",128} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } },
{ "Iris Pro", { {"WGS1",512}, {"WGS2",64} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
@@ -68,6 +69,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",1024} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 750", { {"WGS1",128}, {"WGS2",32} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
@@ -106,6 +108,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",512}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
{ "Iris Pro", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",32}, {"WGS2",32} } },
@@ -119,6 +122,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",32} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
@@ -160,6 +164,7 @@ const Database::DatabaseEntry Database::XdotDouble = {
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",256} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
@@ -201,6 +206,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",128} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",32} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp
index d19c55b5..ffe3dc57 100644
--- a/src/database/kernels/xgemm.hpp
+++ b/src/database/kernels/xgemm.hpp
@@ -57,9 +57,10 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ "Intel(R) HD Graphics 530", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
- { "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
+ { "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
@@ -77,7 +78,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "GeForce GTX 750", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
- { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
+ { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
@@ -127,6 +128,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
@@ -147,7 +149,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 750", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
- { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
@@ -206,7 +208,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 750", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
- { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+ { "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -265,7 +267,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 750", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
- { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+ { "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp
new file mode 100644
index 00000000..53dfeaa6
--- /dev/null
+++ b/src/database/kernels/xgemm_direct.hpp
@@ -0,0 +1,136 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Xgemm_Direct' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDirectHalf = {
+ "XgemmDirect", Precision::kHalf, {
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDirectSingle = {
+ "XgemmDirect", Precision::kSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Iris Pro", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDirectComplexSingle = {
+ "XgemmDirect", Precision::kComplexSingle, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ }
+ },
+ { // Intel GPUs
+ kDeviceTypeGPU, "Intel", {
+ { "Iris Pro", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 750 Ti", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDirectDouble = {
+ "XgemmDirect", Precision::kDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDirectComplexDouble = {
+ "XgemmDirect", Precision::kComplexDouble, {
+ { // AMD GPUs
+ kDeviceTypeGPU, "AMD", {
+ { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+ }
+ },
+ { // NVIDIA GPUs
+ kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+ { "default", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+ }
+ },
+ { // Default
+ kDeviceTypeAll, "default", {
+ { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp
index e5e8845e..be6606a6 100644
--- a/src/database/kernels/xgemv.hpp
+++ b/src/database/kernels/xgemv.hpp
@@ -57,6 +57,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Iris", { {"WGS1",64}, {"WPT1",2} } },
{ "Iris Pro", { {"WGS1",256}, {"WPT1",2} } },
@@ -77,7 +78,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WPT1",1} } },
- { "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } },
@@ -120,6 +121,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Iris", { {"WGS1",256}, {"WPT1",1} } },
{ "Iris Pro", { {"WGS1",64}, {"WPT1",1} } },
@@ -140,7 +142,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",128}, {"WPT1",1} } },
- { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
@@ -188,7 +190,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
{ "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WPT1",1} } },
- { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } },
diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp
index 52af628c..cd430dcb 100644
--- a/src/database/kernels/xgemv_fast.hpp
+++ b/src/database/kernels/xgemv_fast.hpp
@@ -57,10 +57,11 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
{ "Intel(R) HD Graphics 530", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
- { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } },
+ { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
}
},
{ // Intel accelerators
@@ -77,7 +78,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
{ "GeForce GTX 670", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 750", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
- { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+ { "GeForce GTX 750 Ti", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
@@ -120,6 +121,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Iris Pro", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
@@ -139,7 +141,6 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
- { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@@ -186,7 +187,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 750", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
- { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+ { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp
index 328094e1..2dd7db32 100644
--- a/src/database/kernels/xgemv_fast_rot.hpp
+++ b/src/database/kernels/xgemv_fast_rot.hpp
@@ -44,6 +44,7 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
@@ -51,8 +52,9 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 750 Ti", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
- { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+ { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Default
@@ -83,6 +85,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } },
@@ -114,8 +117,9 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = {
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
+ { "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
- { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+ { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
}
},
{ // Default
diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp
index 3e9c25c1..7816080f 100644
--- a/src/database/kernels/xger.hpp
+++ b/src/database/kernels/xger.hpp
@@ -63,6 +63,7 @@ const Database::DatabaseEntry Database::XgerSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } },
{ "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
@@ -76,8 +77,9 @@ const Database::DatabaseEntry Database::XgerSingle = {
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
- { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
+ { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // Default
@@ -120,6 +122,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
+ { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
{ "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
@@ -133,6 +136,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
{ "GeForce GTX 670", { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 750", { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
@@ -180,8 +184,9 @@ const Database::DatabaseEntry Database::XgerDouble = {
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
- { "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
+ { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // Default
@@ -227,6 +232,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
{ "GeForce GTX 670", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX 750", { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } },
+ { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
}
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 223501fd..b0817242 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -204,7 +204,7 @@ R"(
#if PRECISION == 3232 || PRECISION == 6464
#define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y
#else
- #define COMPLEX_CONJUGATE(value) value = value
+ #define COMPLEX_CONJUGATE(value)
#endif
// =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl
new file mode 100644
index 00000000..a8bd450e
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part1.opencl
@@ -0,0 +1,273 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any
+// pre and and post-processing kernels.
+//
+// This kernel is seperated into three files. This is part 1 out of 3.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library. Note that all parameters here have a
+// suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel.
+#ifndef WGD
+ #define WGD 8 // Tile-size in dimension M, N, and K (e.g. 8, 16, 32, 64)
+#endif
+#ifndef MDIMCD
+ #define MDIMCD 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef NDIMCD
+ #define NDIMCD 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef MDIMAD
+ #define MDIMAD 8 // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
+#endif
+#ifndef NDIMBD
+ #define NDIMBD 8 // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
+#endif
+#ifndef KWID
+ #define KWID 1 // Unroll factor of the WGD loop (smaller or equal than WGD)
+#endif
+#ifndef VWMD
+ #define VWMD 1 // Vector width of matrices A and C
+#endif
+#ifndef VWND
+ #define VWND 1 // Vector width of matrix B
+#endif
+#ifndef PADA
+ #define PADA 1 // Local memory padding for matrix A
+#endif
+#ifndef PADB
+ #define PADB 1 // Local memory padding for matrix B
+#endif
+
+// Helper parameters based on the above tuning parameters
+#define MWID (WGD/MDIMCD) // Work per work-item (M-dimension)
+#define NWID (WGD/NDIMCD) // Work per work-item (N-dimension)
+#define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD)) // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
+#define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD)) // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
+#define MWAD (WGD/MDIMAD) // Amount of loads-per-thread for matrix A (M-dimension)
+#define KWAD (WGD/KDIMAD) // Amount of loads-per-thread for matrix A (K-dimension)
+#define KWBD (WGD/KDIMBD) // Amount of loads-per-thread for matrix B (K-dimension)
+#define NWBD (WGD/NDIMBD) // Amount of loads-per-thread for matrix B (N-dimension)
+
+// =================================================================================================
+
+// Data-widths in dimension M
+#if VWMD == 1
+ typedef real realMD;
+#elif VWMD == 2
+ typedef real2 realMD;
+#elif VWMD == 4
+ typedef real4 realMD;
+#elif VWMD == 8
+ typedef real8 realMD;
+#elif VWMD == 16
+ typedef real16 realMD;
+#endif
+
+// Data-widths in dimension N
+#if VWND == 1
+ typedef real realND;
+#elif VWND == 2
+ typedef real2 realND;
+#elif VWND == 4
+ typedef real4 realND;
+#elif VWND == 8
+ typedef real8 realND;
+#elif VWND == 16
+ typedef real16 realND;
+#endif
+
+// =================================================================================================
+
+// Initializes the accumulation registers to zero
+inline void InitAccRegistersDirect(real cpm[NWID][MWID]) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ SetToZero(cpm[ni][mi]);
+ }
+ }
+}
+
+// =================================================================================================
+
+// Performs the actual computation: Cpm += Apm * Bpm
+inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+ MultiplyAdd(cpm[ni][mi], apm[mi], bpm[ni]);
+ }
+ }
+}
+
+// =================================================================================================
+
+// Loads global off-chip memory into thread-private register files. This function is specific for
+// loading the A input matrix.
+inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID],
+ const int a_ld, const int a_offset, const int idm, const int idk,
+ const int a_transpose, const int a_conjugate) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+ const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
+ apm[mi] = agms[a_index + a_offset];
+ if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
+ }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID],
+ const int b_ld, const int b_offset, const int idn, const int idk,
+ const int b_transpose, const int b_conjugate) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
+ bpm[ni] = bgms[b_index + b_offset];
+ if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
+ }
+}
+
+// Loads global off-chip memory into thread-private register files. This function is specific for
+// loading the A input matrix. This is the same as above but now includes a bounds check.
+inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID],
+ const int a_ld, const int a_offset, const int idm, const int idk,
+ const int a_transpose, const int a_conjugate,
+ const int kSizeM) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+ if (idm + mi < kSizeM) {
+ const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
+ apm[mi] = agms[a_index + a_offset];
+ if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
+ }
+ else {
+ SetToZero(apm[mi]);
+ }
+ }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID],
+ const int b_ld, const int b_offset, const int idn, const int idk,
+ const int b_transpose, const int b_conjugate,
+ const int kSizeN) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ if (idn + ni < kSizeN) {
+ const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
+ bpm[ni] = bgms[b_index + b_offset];
+ if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
+ }
+ else {
+ SetToZero(bpm[ni]);
+ }
+ }
+}
+
+// =================================================================================================
+
+// Caches on-chip local memory into per-thread private memory (registers). This function is specific
+// for caching the A input matrix.
+inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg,
+ const int a_transpose) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+ const int mg = mi + get_local_id(0)*MWID;
+ const int index = (a_transpose) ? mg*(WGD + PADA) + kg : kg*(WGD + PADA) + mg;
+ apm[mi] = alm[index];
+ }
+}
+
+// Same as above, but now for the B input matrix
+inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg,
+ const int b_transpose) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ const int ng = ni + get_local_id(1)*NWID;
+ const int index = (b_transpose) ? ng*(WGD + PADB) + kg : kg*(WGD + PADB) + ng;
+ bpm[ni] = blm[index];
+ }
+}
+
+// =================================================================================================
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID],
+ const int idm, const int idn,
+ const real alpha, const real beta,
+ const int c_ld, const int c_offset, const int c_transpose) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+
+ // Determines the destination index
+ int c_index = (c_transpose) ? (idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi);
+
+ // The final multiplication with alpha (in case beta == 0)
+ real result;
+ if (IsZero(beta)) {
+ Multiply(result, alpha, cpm[ni][mi]);
+ }
+ // The final multiplication with alpha and the addition with beta*C
+ else {
+ AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]);
+ }
+ cgm[c_index + c_offset] = result;
+ }
+ }
+}
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID],
+ const int idm, const int idn, const int kSizeM, const int kSizeN,
+ const real alpha, const real beta,
+ const int c_ld, const int c_offset, const int c_transpose) {
+ #pragma unroll
+ for (int ni=0; ni<NWID; ++ni) {
+ #pragma unroll
+ for (int mi=0; mi<MWID; ++mi) {
+ if ((idm + mi) < kSizeM && (idn + ni) < kSizeN) {
+
+ // Determines the destination index
+ int c_index = (c_transpose) ? (idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi);
+
+ // The final multiplication with alpha (in case beta == 0)
+ real result;
+ if (IsZero(beta)) {
+ Multiply(result, alpha, cpm[ni][mi]);
+ }
+ // The final multiplication with alpha and the addition with beta*C
+ else {
+ AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]);
+ }
+ cgm[c_index + c_offset] = result;
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl
new file mode 100644
index 00000000..d77cbf65
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part2.opencl
@@ -0,0 +1,314 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 2 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix.
+inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
+ const int a_ld, const int a_offset, const int kwg,
+ const int a_transpose, const int a_conjugate) {
+ #if MDIMCD == MDIMAD
+ const int la0 = get_local_id(0);
+ const int la1 = get_local_id(1);
+ #else
+ const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+ const int la0 = tid % MDIMAD;
+ const int la1 = tid / MDIMAD;
+ #endif
+ #pragma unroll
+ for (int mia=0; mia<MWAD/VWMD; ++mia) {
+ #pragma unroll
+ for (int kia=0; kia<KWAD; ++kia) {
+
+ // Computes the indices for the global memory
+ int mg = mia + la0*(MWAD/VWMD);
+ int kg = kia + la1*KWAD;
+ int idm = (a_transpose) ? mg + kwg/VWMD : mg + GetGroupID0()*(WGD/VWMD);
+ int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+ // Loads the data from global memory into the local memory
+ const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset];
+ #if VWMD == 1
+ alm[kg*(WGD + PADA) + mg] = avec;
+ #elif VWMD == 2
+ alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
+ alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
+ #elif VWMD == 4
+ alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
+ alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
+ alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.z;
+ alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.w;
+ #elif VWMD == 8
+ alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
+ alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
+ alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
+ alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
+ alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
+ alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
+ alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
+ alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
+ #elif VWMD == 16
+ alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
+ alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
+ alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
+ alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
+ alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
+ alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
+ alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
+ alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
+ alm[kg*(WGD + PADA) + mg*VWMD + 8] = avec.s8;
+ alm[kg*(WGD + PADA) + mg*VWMD + 9] = avec.s9;
+ alm[kg*(WGD + PADA) + mg*VWMD + 10] = avec.sA;
+ alm[kg*(WGD + PADA) + mg*VWMD + 11] = avec.sB;
+ alm[kg*(WGD + PADA) + mg*VWMD + 12] = avec.sC;
+ alm[kg*(WGD + PADA) + mg*VWMD + 13] = avec.sD;
+ alm[kg*(WGD + PADA) + mg*VWMD + 14] = avec.sE;
+ alm[kg*(WGD + PADA) + mg*VWMD + 15] = avec.sF;
+ #endif
+ if (a_conjugate) {
+ for (int vm=0; vm<VWMD; ++vm) {
+ COMPLEX_CONJUGATE(alm[kg*(WGD + PADA) + mg*VWMD + vm]);
+ }
+ }
+ }
+ }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
+ const int b_ld, const int b_offset, const int kwg,
+ const int b_transpose, const int b_conjugate) {
+ #if MDIMCD == NDIMBD
+ const int lb0 = get_local_id(0);
+ const int lb1 = get_local_id(1);
+ #else
+ const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+ const int lb0 = tid % NDIMBD;
+ const int lb1 = tid / NDIMBD;
+ #endif
+ #pragma unroll
+ for (int kib=0; kib<KWBD; ++kib) {
+ #pragma unroll
+ for (int nib=0; nib<NWBD/VWND; ++nib) {
+
+ // Computes the indices for the global memory
+ int ng = nib + lb0*(NWBD/VWND);
+ int kg = kib + lb1*KWBD;
+ int idn = (b_transpose) ? ng + kwg/VWND : ng + GetGroupID1()*(WGD/VWND);
+ int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+ // Loads the data from global memory into the local memory
+ const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset];
+ #if VWND == 1
+ blm[kg*(WGD + PADB) + ng] = bvec;
+ #elif VWND == 2
+ blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
+ blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
+ #elif VWND == 4
+ blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
+ blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
+ blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.z;
+ blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.w;
+ #elif VWND == 8
+ blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
+ blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
+ blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
+ blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
+ blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
+ blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
+ blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
+ blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
+ #elif VWND == 16
+ blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
+ blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
+ blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
+ blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
+ blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
+ blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
+ blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
+ blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
+ blm[kg*(WGD + PADB) + ng*VWND + 8] = bvec.s8;
+ blm[kg*(WGD + PADB) + ng*VWND + 9] = bvec.s9;
+ blm[kg*(WGD + PADB) + ng*VWND + 10] = bvec.sA;
+ blm[kg*(WGD + PADB) + ng*VWND + 11] = bvec.sB;
+ blm[kg*(WGD + PADB) + ng*VWND + 12] = bvec.sC;
+ blm[kg*(WGD + PADB) + ng*VWND + 13] = bvec.sD;
+ blm[kg*(WGD + PADB) + ng*VWND + 14] = bvec.sE;
+ blm[kg*(WGD + PADB) + ng*VWND + 15] = bvec.sF;
+ #endif
+ if (b_conjugate) {
+ for (int vn=0; vn<VWND; ++vn) {
+ COMPLEX_CONJUGATE(blm[kg*(WGD + PADB) + ng*VWND + vn]);
+ }
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix. In contrast to the functions above, this function performs doesn't
+// use the vector data-types.
+inline void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
+ const int a_ld, const int a_offset, const int kwg,
+ const int a_transpose, const int a_conjugate) {
+ #if MDIMCD == MDIMAD
+ const int la0 = get_local_id(0);
+ const int la1 = get_local_id(1);
+ #else
+ const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+ const int la0 = tid % MDIMAD;
+ const int la1 = tid / MDIMAD;
+ #endif
+ #pragma unroll
+ for (int mia=0; mia<MWAD; ++mia) {
+ #pragma unroll
+ for (int kia=0; kia<KWAD; ++kia) {
+
+ // Computes the indices for the global memory
+ int mg = mia + la0*MWAD;
+ int kg = kia + la1*KWAD;
+ int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
+ int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+ // Loads the data from global memory into the local memory
+ real result = agms[idk*a_ld + idm + a_offset];
+ if (a_conjugate) { COMPLEX_CONJUGATE(result); }
+ alm[kg*(WGD + PADA) + mg] = result;
+ }
+ }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
+ const int b_ld, const int b_offset, const int kwg,
+ const int b_transpose, const int b_conjugate) {
+ #if MDIMCD == NDIMBD
+ const int lb0 = get_local_id(0);
+ const int lb1 = get_local_id(1);
+ #else
+ const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+ const int lb0 = tid % NDIMBD;
+ const int lb1 = tid / NDIMBD;
+ #endif
+ #pragma unroll
+ for (int kib=0; kib<KWBD; ++kib) {
+ #pragma unroll
+ for (int nib=0; nib<NWBD; ++nib) {
+
+ // Computes the indices for the global memory
+ int ng = nib + lb0*NWBD;
+ int kg = kib + lb1*KWBD;
+ int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
+ int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+ // Loads the data from global memory into the local memory
+ real result = bgms[idk*b_ld + idn + b_offset];
+ if (b_conjugate) { COMPLEX_CONJUGATE(result); }
+ blm[kg*(WGD + PADB) + ng] = result;
+ }
+ }
+}
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix. In contrast to the functions above, this function performs bounds
+// checks and doesn't use the vector data-types.
+inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
+ const int a_ld, const int a_offset, const int kwg,
+ const int a_transpose, const int a_conjugate,
+ const int kSizeM, const int kSizeK) {
+ #if MDIMCD == MDIMAD
+ const int la0 = get_local_id(0);
+ const int la1 = get_local_id(1);
+ #else
+ const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+ const int la0 = tid % MDIMAD;
+ const int la1 = tid / MDIMAD;
+ #endif
+ #pragma unroll
+ for (int mia=0; mia<MWAD; ++mia) {
+ #pragma unroll
+ for (int kia=0; kia<KWAD; ++kia) {
+
+ // Computes the indices for the global memory
+ int mg = mia + la0*MWAD;
+ int kg = kia + la1*KWAD;
+ int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
+ int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+ // Loads the data from global memory into the local memory
+ int condition = (a_transpose) ? idm < kSizeK : idm < kSizeM;
+ if (condition) {
+ real result = agms[idk*a_ld + idm + a_offset];
+ if (a_conjugate) { COMPLEX_CONJUGATE(result); }
+ alm[kg*(WGD + PADA) + mg] = result;
+ }
+ else {
+ SetToZero(alm[kg*(WGD + PADA) + mg]);
+ }
+ }
+ }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
+ const int b_ld, const int b_offset, const int kwg,
+ const int b_transpose, const int b_conjugate,
+ const int kSizeN, const int kSizeK) {
+ #if MDIMCD == NDIMBD
+ const int lb0 = get_local_id(0);
+ const int lb1 = get_local_id(1);
+ #else
+ const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+ const int lb0 = tid % NDIMBD;
+ const int lb1 = tid / NDIMBD;
+ #endif
+ #pragma unroll
+ for (int kib=0; kib<KWBD; ++kib) {
+ #pragma unroll
+ for (int nib=0; nib<NWBD; ++nib) {
+
+ // Computes the indices for the global memory
+ int ng = nib + lb0*NWBD;
+ int kg = kib + lb1*KWBD;
+ int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
+ int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+ // Loads the data from global memory into the local memory
+ int condition = (b_transpose) ? idn < kSizeK : idn < kSizeN;
+ if (condition) {
+ real result = bgms[idk*b_ld + idn + b_offset];
+ if (b_conjugate) { COMPLEX_CONJUGATE(result); }
+ blm[kg*(WGD + PADB) + ng] = result;
+ }
+ else {
+ SetToZero(blm[kg*(WGD + PADB) + ng]);
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl
new file mode 100644
index 00000000..a9350e00
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part3.opencl
@@ -0,0 +1,214 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 3 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Main body of the kernel. This is the direct version without pre/post processing and restrictions.
+inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realMD* restrict agm, const int a_offset, const int a_ld,
+ const __global realND* restrict bgm, const int b_offset, const int b_ld,
+ __global real* cgm, const int c_offset, const int c_ld,
+ __local real* alm, __local real* blm,
+ const int a_transpose, const int b_transpose, const int c_transpose,
+ const int a_conjugate, const int b_conjugate) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Extra pointers to scalar versions of global memory
+ const __global real* restrict agms = (const __global real* restrict) agm;
+ const __global real* restrict bgms = (const __global real* restrict) bgm;
+
+ // Allocates workitem-private memory (registers)
+ real apm[MWID];
+ real bpm[NWID];
+ real cpm[NWID][MWID];
+
+ // Initializes the accumulation registers
+ InitAccRegistersDirect(cpm);
+
+ // The faster version of GEMM is not allowed on the (incomplete) borders. Therefore, this section
+ // processes only the main parts: output blocks of WGD by WGD.
+ const int idm = get_local_id(0) * MWID + GetGroupID0() * WGD;
+ const int idn = get_local_id(1) * NWID + GetGroupID1() * WGD;
+ if ((idm < (kSizeM/WGD)*WGD) && (idn < (kSizeN/WGD)*WGD)) {
+
+ // Loops over all complete workgroup tiles (K-dimension)
+ int kwg = 0;
+ for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
+
+ // Loads data: off-chip --> local (matrix A and B)
+ if (a_ld % VWMD == 0) {
+ GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
+ }
+ else {
+ GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
+ }
+ if (b_ld % VWND == 0) {
+ GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
+ }
+ else {
+ GlobalToLocalScalarB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loops over all workitem tiles, unrolled by a factor KWID
+ for (int pwi=0; pwi<WGD; pwi+=KWID) {
+ #pragma unroll
+ for (int pit=0; pit<KWID; ++pit) {
+ int kg = pwi + pit;
+
+ // Loads data: local --> private (matrix A and B)
+ LocalToPrivateDirectA(alm, apm, kg, a_transpose);
+ LocalToPrivateDirectB(blm, bpm, kg, b_transpose);
+
+ // Performs the accumulation (Cpm += Apm * Bpm)
+ MultiplyAccumulateDirect(cpm, apm, bpm);
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ // Loop over the remaining part (incomplete tile in K-dimension)
+ for (; kwg < kSizeK; ++kwg) {
+
+ // Loads data: off-chip --> private (matrix A and B)
+ GlobalToPrivateDirectA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate);
+ GlobalToPrivateDirectB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate);
+
+ // Performs the accumulation (Cpm += Apm * Bpm)
+ MultiplyAccumulateDirect(cpm, apm, bpm);
+ }
+
+ // Stores a tile of results and performs the multiplication with alpha and beta
+ StoreResultsDirect(cgm, cpm, idm, idn, alpha, beta, c_ld, c_offset, c_transpose);
+ }
+
+ // Simple but slower version for the parts on the edge (incomplete tiles in M and N-dimensions)
+ else {
+
+ // Loops over all complete workgroup tiles (K-dimension)
+ int kwg = 0;
+ for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
+
+ // Loads data: off-chip --> local (matrix A and B)
+ GlobalToLocalCheckedA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate, kSizeM, kSizeK);
+ GlobalToLocalCheckedB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate, kSizeN, kSizeK);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loops over all workitem tiles, unrolled by a factor KWID
+ for (int pwi=0; pwi<WGD; pwi+=KWID) {
+ #pragma unroll
+ for (int pit=0; pit<KWID; ++pit) {
+ int kg = pwi + pit;
+
+ // Loads data: local --> private (matrix A and B)
+ LocalToPrivateDirectA(alm, apm, kg, a_transpose);
+ LocalToPrivateDirectB(blm, bpm, kg, b_transpose);
+
+ // Performs the accumulation (Cpm += Apm * Bpm)
+ MultiplyAccumulateDirect(cpm, apm, bpm);
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ // Loop over the remaining part (incomplete tile in K-dimension)
+ for (; kwg < kSizeK; ++kwg) {
+
+ // Loads data: off-chip --> private (matrix A and B)
+ GlobalToPrivateCheckedA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate, kSizeM);
+ GlobalToPrivateCheckedB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate, kSizeN);
+
+ // Performs the accumulation (Cpm += Apm * Bpm)
+ MultiplyAccumulateDirect(cpm, apm, bpm);
+ }
+
+ // Stores a tile of results and performs the multiplication with alpha and beta
+ StoreResultsChecked(cgm, cpm, idm, idn, kSizeM, kSizeN, alpha, beta, c_ld, c_offset, c_transpose);
+ }
+}
+
+// =================================================================================================
+
+// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha, const real_arg arg_beta,
+ const __global realMD* restrict agm, const int a_offset, const int a_ld,
+ const __global realND* restrict bgm, const int b_offset, const int b_ld,
+ __global real* cgm, const int c_offset, const int c_ld,
+ const int c_transpose, const int a_conjugate, const int b_conjugate) {
+ __local real alm[WGD * (WGD + PADA)];
+ __local real blm[WGD * (WGD + PADB)];
+ XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+ agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+ alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha, const real_arg arg_beta,
+ const __global realMD* restrict agm, const int a_offset, const int a_ld,
+ const __global realND* restrict bgm, const int b_offset, const int b_ld,
+ __global real* cgm, const int c_offset, const int c_ld,
+ const int c_transpose, const int a_conjugate, const int b_conjugate) {
+ __local real alm[WGD * (WGD + PADA)];
+ __local real blm[WGD * (WGD + PADB)];
+ XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+ agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+ alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha, const real_arg arg_beta,
+ const __global realMD* restrict agm, const int a_offset, const int a_ld,
+ const __global realND* restrict bgm, const int b_offset, const int b_ld,
+ __global real* cgm, const int c_offset, const int c_ld,
+ const int c_transpose, const int a_conjugate, const int b_conjugate) {
+ __local real alm[WGD * (WGD + PADA)];
+ __local real blm[WGD * (WGD + PADB)];
+ XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+ agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+ alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [transposed, transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha, const real_arg arg_beta,
+ const __global realMD* restrict agm, const int a_offset, const int a_ld,
+ const __global realND* restrict bgm, const int b_offset, const int b_ld,
+ __global real* cgm, const int c_offset, const int c_ld,
+ const int c_transpose, const int a_conjugate, const int b_conjugate) {
+ __local real alm[WGD * (WGD + PADA)];
+ __local real blm[WGD * (WGD + PADB)];
+ XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+ agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+ alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate);
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/msvc.hpp b/src/msvc.hpp
new file mode 100644
index 00000000..a45105df
--- /dev/null
+++ b/src/msvc.hpp
@@ -0,0 +1,39 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides macro's and definitions to make compilation work on Microsoft Visual Studio,
+// in particular for versions older than 2015 with limited C++11 support.
+// MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015)
+// MSVC++ 12.0 _MSC_VER == 1800 (Visual Studio 2013)
+// MSVC++ 11.0 _MSC_VER == 1700 (Visual Studio 2012)
+// MSVC++ 10.0 _MSC_VER == 1600 (Visual Studio 2010)
+// MSVC++ 9.0 _MSC_VER == 1500 (Visual Studio 2008)
+//
+// =================================================================================================
+
+#ifndef CLBLAST_MSVC_HPP_
+#define CLBLAST_MSVC_HPP_
+
+namespace clblast {
+// =================================================================================================
+#ifdef _MSC_VER
+
+// No support for constexpr prior to 2015. Note that this only works with constants, not with
+// constexpr functions (unused in this project).
+#if _MSC_VER < 1900
+#define constexpr const
+#endif
+
+// _MSC_VER
+#endif
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_MSVC_HPP_
+#endif
diff --git a/src/routine.cpp b/src/routine.cpp
index 189ae190..d938d66f 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -14,6 +14,7 @@
#include <string>
#include <vector>
#include <chrono>
+#include <cstdlib>
#include "routine.hpp"
@@ -42,13 +43,19 @@ StatusCode Routine::SetUp() {
// Queries the cache to see whether or not the program (context-specific) is already there
if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
+ // Sets the build options from an environmental variable (if set)
+ auto options = std::vector<std::string>();
+ const auto environment_variable = std::getenv("CLBLAST_BUILD_OPTIONS");
+ if (environment_variable != nullptr) {
+ options.push_back(std::string(environment_variable));
+ }
+
// Queries the cache to see whether or not the binary (device-specific) is already there. If it
// is, a program is created and stored in the cache
if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
try {
auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
- auto options = std::vector<std::string>();
program.Build(device_, options);
StoreProgramToCache(program, context_, precision_, routine_name_);
} catch (...) { return StatusCode::kBuildProgramFailure; }
@@ -115,7 +122,6 @@ StatusCode Routine::SetUp() {
// Compiles the kernel
try {
auto program = Program(context_, source_string);
- auto options = std::vector<std::string>();
const auto build_status = program.Build(device_, options);
// Checks for compiler crashes/errors/warnings
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 0b8e768f..1602c69f 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -22,7 +22,9 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
- Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
+ Routine(queue, event, name,
+ {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
+ PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
@@ -32,10 +34,16 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/convert_symmetric.opencl"
#include "../../kernels/level3/convert_triangular.opencl"
#include "../../kernels/level3/convert_hermitian.opencl"
+ #include "../../kernels/level3/xgemm_direct_part1.opencl"
+ #include "../../kernels/level3/xgemm_direct_part2.opencl"
+ #include "../../kernels/level3/xgemm_direct_part3.opencl"
+ ;
+ auto source_string_part_2 = // separated in two parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
;
+ source_string_ += source_string_part_2;
}
// =================================================================================================
@@ -98,6 +106,44 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
if (ErrorIn(status)) { return status; }
+ // Selects which version of GEMM to run
+ const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]);
+ if (do_gemm_direct) { // for small sizes (single kernel)
+ return GemmDirect(m, n, k, alpha,
+ a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
+ c_buffer, c_offset, c_ld,
+ a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate);
+ }
+ else { // for larger sizes (pre/post-processing plus a very fast kernel)
+ return GemmIndirect(m, n, k, alpha,
+ a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
+ c_buffer, c_offset, c_ld,
+ a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate,
+ a_one, a_two, a_want_rotated,
+ b_one, b_two, b_want_rotated,
+ c_one, c_two, c_want_rotated);
+ }
+}
+
+// =================================================================================================
+
+// The indirect version of GEMM. This uses the faster but non-general kernel. It has specific
+// requirements, but several pre and post-processing kernels take care of those. However, the
+// overhead of these extra kernels might not be ideal for certain devices/arguments.
+template <typename T>
+StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate,
+ const size_t a_one, const size_t a_two, const bool a_want_rotated,
+ const size_t b_one, const size_t b_two, const bool b_want_rotated,
+ const size_t c_one, const size_t c_two, const bool c_want_rotated) {
+ auto status = StatusCode::kSuccess;
+
// Calculates the ceiled versions of m, n, and k
const auto m_ceiled = Ceil(m, db_["MWG"]);
const auto n_ceiled = Ceil(n, db_["NWG"]);
@@ -217,6 +263,66 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
+
+// =================================================================================================
+
+// The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels.
+template <typename T>
+StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate) {
+
+ // Loads the program from the database
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+ // Retrieves the proper XgemmDirect kernel from the compiled binary
+ try {
+ const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
+ (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
+ auto kernel = Kernel(program, name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(m));
+ kernel.SetArgument(1, static_cast<int>(n));
+ kernel.SetArgument(2, static_cast<int>(k));
+ kernel.SetArgument(3, GetRealArg(alpha));
+ kernel.SetArgument(4, GetRealArg(beta));
+ kernel.SetArgument(5, a_buffer());
+ kernel.SetArgument(6, static_cast<int>(a_offset));
+ kernel.SetArgument(7, static_cast<int>(a_ld));
+ kernel.SetArgument(8, b_buffer());
+ kernel.SetArgument(9, static_cast<int>(b_offset));
+ kernel.SetArgument(10, static_cast<int>(b_ld));
+ kernel.SetArgument(11, c_buffer());
+ kernel.SetArgument(12, static_cast<int>(c_offset));
+ kernel.SetArgument(13, static_cast<int>(c_ld));
+ kernel.SetArgument(14, static_cast<int>(c_do_transpose));
+ kernel.SetArgument(15, static_cast<int>(a_conjugate));
+ kernel.SetArgument(16, static_cast<int>(b_conjugate));
+
+ // Computes the global and local thread sizes
+ const auto m_ceiled = Ceil(m, db_["WGD"]);
+ const auto n_ceiled = Ceil(n, db_["WGD"]);
+ const auto global = std::vector<size_t>{
+ (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
+ (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
+ };
+ const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
+
+ // Launches the kernel
+ auto status = RunKernel(kernel, queue_, device_, global, local, event_);
+ if (ErrorIn(status)) { return status; }
+
+ // Successfully finished the computation
+ return StatusCode::kSuccess;
+ } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
// =================================================================================================
// Compiles the templated class
diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp
index bc51c7f5..46e12453 100644
--- a/src/routines/level3/xgemm.hpp
+++ b/src/routines/level3/xgemm.hpp
@@ -35,6 +35,29 @@ class Xgemm: public Routine {
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ // Indirect version of GEMM (with pre and post-processing kernels)
+ StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate,
+ const size_t a_one, const size_t a_two, const bool a_want_rotated,
+ const size_t b_one, const size_t b_two, const bool b_want_rotated,
+ const size_t c_one, const size_t c_two, const bool c_want_rotated);
+
+ // Direct version of GEMM (no pre and post-processing kernels)
+ StatusCode GemmDirect(const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate);
};
// =================================================================================================
diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp
index 78ded56e..c57aab39 100644
--- a/src/tuning/kernels/copy_fast.cpp
+++ b/src/tuning/kernels/copy_fast.cpp
@@ -47,6 +47,7 @@ class TuneCopy {
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp
index 90f5ea82..9486ee8d 100644
--- a/src/tuning/kernels/copy_pad.cpp
+++ b/src/tuning/kernels/copy_pad.cpp
@@ -47,6 +47,7 @@ class TunePad {
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp
index 10fa80cb..2d9d5e49 100644
--- a/src/tuning/kernels/transpose_fast.cpp
+++ b/src/tuning/kernels/transpose_fast.cpp
@@ -47,6 +47,7 @@ class TuneTranspose {
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp
index 507718eb..d364dabe 100644
--- a/src/tuning/kernels/transpose_pad.cpp
+++ b/src/tuning/kernels/transpose_pad.cpp
@@ -47,6 +47,7 @@ class TunePadTranspose {
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index 0033b3c6..403ee9e4 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -51,6 +51,7 @@ class TuneXaxpy {
static size_t DefaultN() { return 4096*1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp
index 1581e13f..f8416761 100644
--- a/src/tuning/kernels/xdot.cpp
+++ b/src/tuning/kernels/xdot.cpp
@@ -47,6 +47,7 @@ class TuneXdot {
static size_t DefaultN() { return 2*1024*1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 4cb7fd00..0eb1875b 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -52,6 +52,7 @@ class TuneXgemm {
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1024; }
static double DefaultFraction() { return (V==1) ? 1.0 : 512.0; } // test all or sample randomly
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
@@ -126,10 +127,10 @@ class TuneXgemm {
// Sets the local memory size
static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
auto LocalMemorySize = [args] (std::vector<size_t> v) {
- return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision));
+ return (((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]))*GetBytes(args.precision));
};
- tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM",
- "SB", "KWG", "NWG", "VWN"});
+ tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG",
+ "SB", "KWG", "NWG"});
}
// Sets the base thread configuration
diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp
new file mode 100644
index 00000000..204e0be4
--- /dev/null
+++ b/src/tuning/kernels/xgemm_direct.cpp
@@ -0,0 +1,196 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file uses the CLTune auto-tuner to tune the direct xgemm kernels. There are two variations:
+// - V==1: This tests some limited set of tuning parameters exhaustively.
+// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, int V>
+class TuneXgemmDirect {
+ public:
+
+ // The representative kernel and the source code
+ static std::string KernelFamily() { return (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; }
+ static std::string KernelName() { return "XgemmDirectTN"; }
+ static std::string GetSources() {
+ return
+ #include "../src/kernels/common.opencl"
+ #include "../src/kernels/level3/xgemm_direct_part1.opencl"
+ #include "../src/kernels/level3/xgemm_direct_part2.opencl"
+ #include "../src/kernels/level3/xgemm_direct_part3.opencl"
+ ;
+ }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction};
+ }
+
+ // Tests for valid arguments
+ static void TestValidArguments(const Arguments<T> &) { }
+
+ // Sets the default values for the arguments
+ static size_t DefaultM() { return 256; }
+ static size_t DefaultN() { return 256; }
+ static size_t DefaultK() { return 256; }
+ static double DefaultFraction() { return (V==1) ? 1.0 : 32.0; } // test all or sample randomly
+ static size_t DefaultNumRuns() { return 4; } // run every kernel this many times for averaging
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
+ static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
+ static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.k; }
+ static size_t GetSizeB(const Arguments<T> &args) { return args.n * args.k; }
+ static size_t GetSizeC(const Arguments<T> &args) { return args.m * args.n; }
+ static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
+
+ // Sets the tuning parameters and their possible values
+ static void SetParameters(cltune::Tuner &tuner, const size_t id) {
+ if (V==1) { // limited subset of tuning parameters - but explorable exhaustively
+ tuner.AddParameter(id, "WGD", {8, 16, 32});
+ tuner.AddParameter(id, "MDIMCD", {8, 16, 32});
+ tuner.AddParameter(id, "NDIMCD", {8, 16, 32});
+ tuner.AddParameter(id, "MDIMAD", {8, 16, 32});
+ tuner.AddParameter(id, "NDIMBD", {8, 16, 32});
+ tuner.AddParameter(id, "KWID", {2});
+ tuner.AddParameter(id, "VWMD", {1, 2, 4, 8});
+ tuner.AddParameter(id, "VWND", {1, 2, 4, 8});
+ tuner.AddParameter(id, "PADA", {1});
+ tuner.AddParameter(id, "PADB", {1});
+ } // a lot more tuning parameters - has to be sampled randomly, too much to test all
+ else {
+ tuner.AddParameter(id, "WGD", {8, 16, 32, 64, 128});
+ tuner.AddParameter(id, "MDIMCD", {8, 16, 32});
+ tuner.AddParameter(id, "NDIMCD", {8, 16, 32});
+ tuner.AddParameter(id, "MDIMAD", {8, 16, 32});
+ tuner.AddParameter(id, "NDIMBD", {8, 16, 32});
+ tuner.AddParameter(id, "KWID", {2, 8, 16});
+ tuner.AddParameter(id, "VWMD", {1, 2, 4, 8});
+ tuner.AddParameter(id, "VWND", {1, 2, 4, 8});
+ tuner.AddParameter(id, "PADA", {0, 1});
+ tuner.AddParameter(id, "PADB", {0, 1});
+ }
+ }
+
+ // Sets the constraints
+ static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+ auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
+ auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
+ auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
+ // Requirement for unrolling the WGD loop
+ tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"});
+ // Required for integer MWID and NWID
+ tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"});
+ tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"});
+ // Required for integer MWIAD and NWIBD
+ tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"});
+ tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"});
+ // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...)
+ tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"});
+ tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"});
+
+ // Extra constraints for variation 1 to limit the set of options significantly
+ if (V==1) {
+ auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; };
+ tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"});
+ tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"});
+ }
+ }
+
+ // Sets the local memory size
+ static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
+ auto LocalMemorySize = [args] (std::vector<size_t> v) {
+ return ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))*GetBytes(args.precision));
+ };
+ tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"});
+ }
+
+ // Sets the base thread configuration
+ static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
+ static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
+ static std::vector<size_t> LocalSize() { return {1, 1}; }
+ static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
+
+ // Transforms the thread configuration based on the parameters
+ using TransformVector = std::vector<std::vector<std::string>>;
+ static TransformVector MulLocal() { return {{"MDIMCD", "NDIMCD"}}; }
+ static TransformVector DivLocal() { return {}; }
+ static TransformVector MulGlobal() { return {{"MDIMCD", "NDIMCD"}}; }
+ static TransformVector DivGlobal() { return {{"WGD", "WGD"}}; }
+
+ // Sets the kernel's arguments
+ static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
+ std::vector<T> &, std::vector<T> &,
+ std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
+ std::vector<T> &) {
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(static_cast<int>(args.n));
+ tuner.AddArgumentScalar(static_cast<int>(args.k));
+ tuner.AddArgumentScalar(GetRealArg(args.alpha));
+ tuner.AddArgumentScalar(GetRealArg(args.beta));
+ tuner.AddArgumentInput(a_mat);
+ tuner.AddArgumentScalar(0); // a_offset
+ tuner.AddArgumentScalar(static_cast<int>(args.k)); // a_ld
+ tuner.AddArgumentInput(b_mat);
+ tuner.AddArgumentScalar(0); // b_offset
+ tuner.AddArgumentScalar(static_cast<int>(args.n)); // b_ld
+ tuner.AddArgumentOutput(c_mat);
+ tuner.AddArgumentScalar(0); // c_offset
+ tuner.AddArgumentScalar(static_cast<int>(args.n)); // c_ld
+ tuner.AddArgumentScalar(1); // c_do_transpose
+ tuner.AddArgumentScalar(0); // a_conjugate
+ tuner.AddArgumentScalar(0); // b_conjugate
+ }
+
+ // Describes how to compute the performance metrics
+ static size_t GetMetric(const Arguments<T> &args) {
+ return 2 * args.m * args.n * args.k;
+ }
+ static std::string PerformanceUnit() { return "GFLOPS"; }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Function to tune a specific variation V (not within the clblast namespace)
+template <int V>
+void StartVariation(int argc, char *argv[]) {
+ switch(clblast::GetPrecision(argc, argv)) {
+ case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemmDirect<half,V>, half>(argc, argv); break;
+ case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemmDirect<float,V>, float>(argc, argv); break;
+ case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemmDirect<double,V>, double>(argc, argv); break;
+ case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemmDirect<float2,V>, float2>(argc, argv); break;
+ case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemmDirect<double2,V>, double2>(argc, argv); break;
+ }
+}
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ StartVariation<1>(argc, argv);
+ StartVariation<2>(argc, argv);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index 7229602d..f332f52a 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -50,6 +50,7 @@ class TuneXgemv {
static size_t DefaultN() { return 2048; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp
index 1fb5c531..c3d0c7dd 100644
--- a/src/tuning/kernels/xger.cpp
+++ b/src/tuning/kernels/xger.cpp
@@ -47,6 +47,7 @@ class TuneXger {
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
+ static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) { return args.m; }
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 19df5f9a..afb092bc 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -30,6 +30,7 @@ namespace clblast {
// that it is automatically compiled for the various kernels (given as the 'C' template argument).
template <typename C, typename T>
void Tuner(int argc, char* argv[]) {
+ constexpr auto kSeed = 42; // fixed seed for reproducibility
// Sets the parameters and platform/device for which to tune (command-line options)
auto help = std::string{"* Options given/available:\n"};
@@ -45,6 +46,8 @@ void Tuner(int argc, char* argv[]) {
if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); }
}
+ const auto num_runs = GetArgument(argc, argv, help, kArgNumRuns, C::DefaultNumRuns());
+
fprintf(stdout, "%s\n", help.c_str());
// Tests validity of the given arguments
@@ -73,12 +76,12 @@ void Tuner(int argc, char* argv[]) {
auto b_mat = std::vector<T>(C::GetSizeB(args));
auto c_mat = std::vector<T>(C::GetSizeC(args));
auto temp = std::vector<T>(C::GetSizeTemp(args));
- PopulateVector(x_vec);
- PopulateVector(y_vec);
- PopulateVector(a_mat);
- PopulateVector(b_mat);
- PopulateVector(c_mat);
- PopulateVector(temp);
+ PopulateVector(x_vec, kSeed);
+ PopulateVector(y_vec, kSeed);
+ PopulateVector(a_mat, kSeed);
+ PopulateVector(b_mat, kSeed);
+ PopulateVector(c_mat, kSeed);
+ PopulateVector(temp, kSeed);
// Initializes the tuner for the chosen device
cltune::Tuner tuner(args.platform_id, args.device_id);
@@ -126,6 +129,7 @@ void Tuner(int argc, char* argv[]) {
C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);
// Starts the tuning process
+ tuner.SetNumRuns(num_runs);
tuner.Tune();
// Prints the results to screen
@@ -134,7 +138,7 @@ void Tuner(int argc, char* argv[]) {
// Also prints the performance of the best-case in terms of GB/s or GFLOPS
if (time_ms != 0.0) {
- printf("[ -------> ] %.1lf ms", time_ms);
+ printf("[ -------> ] %.2lf ms", time_ms);
printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str());
}
diff --git a/src/utilities.cpp b/src/utilities.cpp
index 77bc72d7..86cc2d13 100644
--- a/src/utilities.cpp
+++ b/src/utilities.cpp
@@ -270,40 +270,40 @@ unsigned int GetRandomSeed() {
// Create a random number generator and populates a vector with samples from a random distribution
template <typename T>
-void PopulateVector(std::vector<T> &vector) {
+void PopulateVector(std::vector<T> &vector, const unsigned int seed) {
auto lower_limit = static_cast<T>(kTestDataLowerLimit);
auto upper_limit = static_cast<T>(kTestDataUpperLimit);
- std::mt19937 mt(GetRandomSeed());
+ std::mt19937 mt(seed);
std::uniform_real_distribution<T> dist(lower_limit, upper_limit);
for (auto &element: vector) { element = dist(mt); }
}
-template void PopulateVector<float>(std::vector<float>&);
-template void PopulateVector<double>(std::vector<double>&);
+template void PopulateVector<float>(std::vector<float>&, const unsigned int);
+template void PopulateVector<double>(std::vector<double>&, const unsigned int);
// Specialized versions of the above for complex data-types
template <>
-void PopulateVector(std::vector<float2> &vector) {
+void PopulateVector(std::vector<float2> &vector, const unsigned int seed) {
auto lower_limit = static_cast<float>(kTestDataLowerLimit);
auto upper_limit = static_cast<float>(kTestDataUpperLimit);
- std::mt19937 mt(GetRandomSeed());
+ std::mt19937 mt(seed);
std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
}
template <>
-void PopulateVector(std::vector<double2> &vector) {
+void PopulateVector(std::vector<double2> &vector, const unsigned int seed) {
auto lower_limit = static_cast<double>(kTestDataLowerLimit);
auto upper_limit = static_cast<double>(kTestDataUpperLimit);
- std::mt19937 mt(GetRandomSeed());
+ std::mt19937 mt(seed);
std::uniform_real_distribution<double> dist(lower_limit, upper_limit);
for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
}
// Specialized versions of the above for half-precision
template <>
-void PopulateVector(std::vector<half> &vector) {
+void PopulateVector(std::vector<half> &vector, const unsigned int seed) {
const auto lower_limit = static_cast<float>(kTestDataLowerLimit);
const auto upper_limit = static_cast<float>(kTestDataUpperLimit);
- std::mt19937 mt(GetRandomSeed());
+ std::mt19937 mt(seed);
std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
for (auto &element: vector) { element = FloatToHalf(dist(mt)); }
}
diff --git a/src/utilities.hpp b/src/utilities.hpp
index 75bd5a69..038a8a96 100644
--- a/src/utilities.hpp
+++ b/src/utilities.hpp
@@ -25,6 +25,8 @@
#include "clblast_half.h"
#include "clpp11.hpp"
+#include "msvc.hpp"
+
namespace clblast {
// =================================================================================================
@@ -206,7 +208,7 @@ bool CheckArgument(const int argc, char *argv[], std::string &help, const std::s
// =================================================================================================
// Helper function to check for errors in the status code
-constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
+inline bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
// =================================================================================================
@@ -219,7 +221,7 @@ constexpr auto kTestDataUpperLimit = 2.0;
// Populates a vector with random data
template <typename T>
-void PopulateVector(std::vector<T> &vector);
+void PopulateVector(std::vector<T> &vector, const unsigned int seed);
// =================================================================================================
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index 2e751255..fc908b9e 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -66,13 +66,13 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset);
scalar_source_.resize(std::max(max_mat, max_matvec) + max_offset);
- PopulateVector(x_source_);
- PopulateVector(y_source_);
- PopulateVector(a_source_);
- PopulateVector(b_source_);
- PopulateVector(c_source_);
- PopulateVector(ap_source_);
- PopulateVector(scalar_source_);
+ PopulateVector(x_source_, kSeed);
+ PopulateVector(y_source_, kSeed);
+ PopulateVector(a_source_, kSeed);
+ PopulateVector(b_source_, kSeed);
+ PopulateVector(c_source_, kSeed);
+ PopulateVector(ap_source_, kSeed);
+ PopulateVector(scalar_source_, kSeed);
}
// ===============================================================================================
diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp
index d01cd06c..4b773801 100644
--- a/test/correctness/testblas.hpp
+++ b/test/correctness/testblas.hpp
@@ -30,6 +30,7 @@ namespace clblast {
template <typename T, typename U>
class TestBlas: public Tester<T,U> {
public:
+ static constexpr auto kSeed = 42; // fixed seed for reproducibility
// Uses several variables from the Tester class
using Tester<T,U>::context_;
diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp
index 362c5c2c..41e457b6 100644
--- a/test/correctness/tester.cpp
+++ b/test/correctness/tester.cpp
@@ -358,28 +358,33 @@ void Tester<T,U>::PrintErrorLog(const std::vector<ErrorLogEntry> &error_log) {
// Compares two floating point values and returns whether they are within an acceptable error
// margin. This replaces GTest's EXPECT_NEAR().
template <typename T>
-bool TestSimilarity(const T val1, const T val2) {
+bool TestSimilarityNear(const T val1, const T val2,
+ const T error_margin_absolute, const T error_margin_relative) {
const auto difference = std::fabs(val1 - val2);
- // Set the allowed error margin for floating-point comparisons
- constexpr auto kErrorMarginRelative = T(0.025);
- constexpr auto kErrorMarginAbsolute = T(1.0e-3);
-
// Shortcut, handles infinities
if (val1 == val2) {
return true;
}
// The values are zero or very small: the relative error is less meaningful
- else if (val1 == 0 || val2 == 0 || difference < kErrorMarginAbsolute) {
- return (difference < kErrorMarginAbsolute);
+ else if (val1 == 0 || val2 == 0 || difference < error_margin_absolute) {
+ return (difference < error_margin_absolute);
}
// Use relative error
else {
const auto absolute_sum = std::fabs(val1) + std::fabs(val2);
- return (difference / absolute_sum) < kErrorMarginRelative;
+ return (difference / absolute_sum) < error_margin_relative;
}
}
+// Default method for similarity testing
+template <typename T>
+bool TestSimilarity(const T val1, const T val2) {
+ constexpr auto kErrorMarginRelative = T(0.025);
+ constexpr auto kErrorMarginAbsolute = T(0.001);
+ return TestSimilarityNear(val1, val2, kErrorMarginRelative, kErrorMarginAbsolute);
+}
+
// Compiles the default case for standard data-types
template bool TestSimilarity<float>(const float, const float);
template bool TestSimilarity<double>(const double, const double);
@@ -399,7 +404,10 @@ bool TestSimilarity(const double2 val1, const double2 val2) {
}
template <>
bool TestSimilarity(const half val1, const half val2) {
- return TestSimilarity(HalfToFloat(val1), HalfToFloat(val2));
+ constexpr auto kErrorMarginRelative = float(0.050);
+ constexpr auto kErrorMarginAbsolute = float(0.002);
+ return TestSimilarityNear(HalfToFloat(val1), HalfToFloat(val2),
+ kErrorMarginRelative, kErrorMarginAbsolute);
}
// =================================================================================================
diff --git a/test/performance/client.cpp b/test/performance/client.cpp
index aaaab22e..cbb10d10 100644
--- a/test/performance/client.cpp
+++ b/test/performance/client.cpp
@@ -178,13 +178,13 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
std::vector<T> c_source(args.c_size);
std::vector<T> ap_source(args.ap_size);
std::vector<T> scalar_source(args.scalar_size);
- PopulateVector(x_source);
- PopulateVector(y_source);
- PopulateVector(a_source);
- PopulateVector(b_source);
- PopulateVector(c_source);
- PopulateVector(ap_source);
- PopulateVector(scalar_source);
+ PopulateVector(x_source, kSeed);
+ PopulateVector(y_source, kSeed);
+ PopulateVector(a_source, kSeed);
+ PopulateVector(b_source, kSeed);
+ PopulateVector(c_source, kSeed);
+ PopulateVector(ap_source, kSeed);
+ PopulateVector(scalar_source, kSeed);
// Creates the matrices on the device
auto x_vec = Buffer<T>(context, args.x_size);
diff --git a/test/performance/client.hpp b/test/performance/client.hpp
index 6d35fced..381ba158 100644
--- a/test/performance/client.hpp
+++ b/test/performance/client.hpp
@@ -40,6 +40,7 @@ namespace clblast {
template <typename T, typename U>
class Client {
public:
+ static constexpr auto kSeed = 42; // fixed seed for reproducibility
// Shorthand for the routine-specific functions passed to the tester
using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp
index 4423845e..f98bdb06 100644
--- a/test/routines/level1/xamax.hpp
+++ b/test/routines/level1/xamax.hpp
@@ -76,7 +76,7 @@ class TestXamax {
buffers.scalar(), args.imax_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level1/xasum.hpp b/test/routines/level1/xasum.hpp
index b1f02dcd..64aa37c2 100644
--- a/test/routines/level1/xasum.hpp
+++ b/test/routines/level1/xasum.hpp
@@ -76,7 +76,7 @@ class TestXasum {
buffers.scalar(), args.asum_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level1/xaxpy.hpp b/test/routines/level1/xaxpy.hpp
index c276a42e..b24e6fe8 100644
--- a/test/routines/level1/xaxpy.hpp
+++ b/test/routines/level1/xaxpy.hpp
@@ -77,7 +77,7 @@ class TestXaxpy {
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level1/xcopy.hpp b/test/routines/level1/xcopy.hpp
index a96bb9ae..87bc21d4 100644
--- a/test/routines/level1/xcopy.hpp
+++ b/test/routines/level1/xcopy.hpp
@@ -76,7 +76,7 @@ class TestXcopy {
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level1/xdot.hpp b/test/routines/level1/xdot.hpp
index f6cf2809..c4f6076a 100644
--- a/test/routines/level1/xdot.hpp
+++ b/test/routines/level1/xdot.hpp
@@ -81,7 +81,7 @@ class TestXdot {
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level1/xdotc.hpp b/test/routines/level1/xdotc.hpp
index 2b00d04b..aae892a8 100644
--- a/test/routines/level1/xdotc.hpp
+++ b/test/routines/level1/xdotc.hpp
@@ -81,7 +81,7 @@ class TestXdotc {
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level1/xdotu.hpp b/test/routines/level1/xdotu.hpp
index 31a867e0..f6be385b 100644
--- a/test/routines/level1/xdotu.hpp
+++ b/test/routines/level1/xdotu.hpp
@@ -81,7 +81,7 @@ class TestXdotu {
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level1/xnrm2.hpp b/test/routines/level1/xnrm2.hpp
index 62d649e3..e604077c 100644
--- a/test/routines/level1/xnrm2.hpp
+++ b/test/routines/level1/xnrm2.hpp
@@ -76,7 +76,7 @@ class TestXnrm2 {
buffers.scalar(), args.nrm2_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level1/xscal.hpp b/test/routines/level1/xscal.hpp
index 79926890..3c438bd6 100644
--- a/test/routines/level1/xscal.hpp
+++ b/test/routines/level1/xscal.hpp
@@ -72,7 +72,7 @@ class TestXscal {
auto status = Scal(args.n, args.alpha,
buffers.x_vec(), args.x_offset, args.x_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level1/xswap.hpp b/test/routines/level1/xswap.hpp
index 8f7e4cfe..a0491f12 100644
--- a/test/routines/level1/xswap.hpp
+++ b/test/routines/level1/xswap.hpp
@@ -76,7 +76,7 @@ class TestXswap {
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xgbmv.hpp b/test/routines/level2/xgbmv.hpp
index 5a907077..5ed92aae 100644
--- a/test/routines/level2/xgbmv.hpp
+++ b/test/routines/level2/xgbmv.hpp
@@ -90,7 +90,7 @@ class TestXgbmv {
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xgemv.hpp b/test/routines/level2/xgemv.hpp
index 1499b2d2..9ee6d535 100644
--- a/test/routines/level2/xgemv.hpp
+++ b/test/routines/level2/xgemv.hpp
@@ -90,7 +90,7 @@ class TestXgemv {
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xger.hpp b/test/routines/level2/xger.hpp
index 5cbed505..42283107 100644
--- a/test/routines/level2/xger.hpp
+++ b/test/routines/level2/xger.hpp
@@ -86,7 +86,7 @@ class TestXger {
buffers.y_vec(), args.y_offset, args.y_inc,
buffers.a_mat(), args.a_offset, args.a_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xgerc.hpp b/test/routines/level2/xgerc.hpp
index d50092cb..ef69c197 100644
--- a/test/routines/level2/xgerc.hpp
+++ b/test/routines/level2/xgerc.hpp
@@ -86,7 +86,7 @@ class TestXgerc {
buffers.y_vec(), args.y_offset, args.y_inc,
buffers.a_mat(), args.a_offset, args.a_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xgeru.hpp b/test/routines/level2/xgeru.hpp
index 9c823b73..b2afc6d8 100644
--- a/test/routines/level2/xgeru.hpp
+++ b/test/routines/level2/xgeru.hpp
@@ -86,7 +86,7 @@ class TestXgeru {
buffers.y_vec(), args.y_offset, args.y_inc,
buffers.a_mat(), args.a_offset, args.a_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xhbmv.hpp b/test/routines/level2/xhbmv.hpp
index 01cb3f51..8bda4d0c 100644
--- a/test/routines/level2/xhbmv.hpp
+++ b/test/routines/level2/xhbmv.hpp
@@ -84,7 +84,7 @@ class TestXhbmv {
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xhemv.hpp b/test/routines/level2/xhemv.hpp
index dadd3975..80565d04 100644
--- a/test/routines/level2/xhemv.hpp
+++ b/test/routines/level2/xhemv.hpp
@@ -84,7 +84,7 @@ class TestXhemv {
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xher.hpp b/test/routines/level2/xher.hpp
index b21c0a9b..d71c8009 100644
--- a/test/routines/level2/xher.hpp
+++ b/test/routines/level2/xher.hpp
@@ -79,7 +79,7 @@ class TestXher {
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.a_mat(), args.a_offset, args.a_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xher2.hpp b/test/routines/level2/xher2.hpp
index 070f823c..083dfa2f 100644
--- a/test/routines/level2/xher2.hpp
+++ b/test/routines/level2/xher2.hpp
@@ -84,7 +84,7 @@ class TestXher2 {
buffers.y_vec(), args.y_offset, args.y_inc,
buffers.a_mat(), args.a_offset, args.a_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xhpmv.hpp b/test/routines/level2/xhpmv.hpp
index d7f9634e..1dd63562 100644
--- a/test/routines/level2/xhpmv.hpp
+++ b/test/routines/level2/xhpmv.hpp
@@ -84,7 +84,7 @@ class TestXhpmv {
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp
index 8f44a68d..a5c77811 100644
--- a/test/routines/level2/xhpr.hpp
+++ b/test/routines/level2/xhpr.hpp
@@ -79,7 +79,7 @@ class TestXhpr {
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.ap_mat(), args.ap_offset,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp
index 666a8dfc..d09178f0 100644
--- a/test/routines/level2/xhpr2.hpp
+++ b/test/routines/level2/xhpr2.hpp
@@ -84,7 +84,7 @@ class TestXhpr2 {
buffers.y_vec(), args.y_offset, args.y_inc,
buffers.ap_mat(), args.ap_offset,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xsbmv.hpp b/test/routines/level2/xsbmv.hpp
index fd5dd68e..8e0f8321 100644
--- a/test/routines/level2/xsbmv.hpp
+++ b/test/routines/level2/xsbmv.hpp
@@ -84,7 +84,7 @@ class TestXsbmv {
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xspmv.hpp b/test/routines/level2/xspmv.hpp
index 63286248..977f733a 100644
--- a/test/routines/level2/xspmv.hpp
+++ b/test/routines/level2/xspmv.hpp
@@ -84,7 +84,7 @@ class TestXspmv {
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp
index f9dead53..93da4b73 100644
--- a/test/routines/level2/xspr.hpp
+++ b/test/routines/level2/xspr.hpp
@@ -79,7 +79,7 @@ class TestXspr {
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.ap_mat(), args.ap_offset,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp
index a2f22098..b835f2b0 100644
--- a/test/routines/level2/xspr2.hpp
+++ b/test/routines/level2/xspr2.hpp
@@ -84,7 +84,7 @@ class TestXspr2 {
buffers.y_vec(), args.y_offset, args.y_inc,
buffers.ap_mat(), args.ap_offset,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xsymv.hpp b/test/routines/level2/xsymv.hpp
index 0d3ca632..0ec96f1d 100644
--- a/test/routines/level2/xsymv.hpp
+++ b/test/routines/level2/xsymv.hpp
@@ -84,7 +84,7 @@ class TestXsymv {
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
buffers.y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xsyr.hpp b/test/routines/level2/xsyr.hpp
index 15ad9595..b49132e3 100644
--- a/test/routines/level2/xsyr.hpp
+++ b/test/routines/level2/xsyr.hpp
@@ -79,7 +79,7 @@ class TestXsyr {
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.a_mat(), args.a_offset, args.a_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xsyr2.hpp b/test/routines/level2/xsyr2.hpp
index a9a61a1f..7c65daa2 100644
--- a/test/routines/level2/xsyr2.hpp
+++ b/test/routines/level2/xsyr2.hpp
@@ -84,7 +84,7 @@ class TestXsyr2 {
buffers.y_vec(), args.y_offset, args.y_inc,
buffers.a_mat(), args.a_offset, args.a_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xtbmv.hpp b/test/routines/level2/xtbmv.hpp
index 54e7fe18..cf30c2f7 100644
--- a/test/routines/level2/xtbmv.hpp
+++ b/test/routines/level2/xtbmv.hpp
@@ -78,7 +78,7 @@ class TestXtbmv {
buffers.a_mat(), args.a_offset, args.a_ld,
buffers.x_vec(), args.x_offset, args.x_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xtpmv.hpp b/test/routines/level2/xtpmv.hpp
index 9776c4de..d08e132f 100644
--- a/test/routines/level2/xtpmv.hpp
+++ b/test/routines/level2/xtpmv.hpp
@@ -78,7 +78,7 @@ class TestXtpmv {
buffers.ap_mat(), args.ap_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level2/xtrmv.hpp b/test/routines/level2/xtrmv.hpp
index 18300e50..cf9a0063 100644
--- a/test/routines/level2/xtrmv.hpp
+++ b/test/routines/level2/xtrmv.hpp
@@ -78,7 +78,7 @@ class TestXtrmv {
buffers.a_mat(), args.a_offset, args.a_ld,
buffers.x_vec(), args.x_offset, args.x_inc,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp
index 5f9bea81..bca3c049 100644
--- a/test/routines/level3/xgemm.hpp
+++ b/test/routines/level3/xgemm.hpp
@@ -92,7 +92,7 @@ class TestXgemm {
buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
buffers.c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level3/xhemm.hpp b/test/routines/level3/xhemm.hpp
index 8c44be25..31c7695f 100644
--- a/test/routines/level3/xhemm.hpp
+++ b/test/routines/level3/xhemm.hpp
@@ -92,7 +92,7 @@ class TestXhemm {
buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
buffers.c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level3/xher2k.hpp b/test/routines/level3/xher2k.hpp
index fd20bbb5..ff2bb6cb 100644
--- a/test/routines/level3/xher2k.hpp
+++ b/test/routines/level3/xher2k.hpp
@@ -91,7 +91,7 @@ class TestXher2k {
buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
buffers.c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level3/xherk.hpp b/test/routines/level3/xherk.hpp
index 12990d39..26396fa9 100644
--- a/test/routines/level3/xherk.hpp
+++ b/test/routines/level3/xherk.hpp
@@ -82,7 +82,7 @@ class TestXherk {
buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
buffers.c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level3/xsymm.hpp b/test/routines/level3/xsymm.hpp
index f8e90927..c84c22b4 100644
--- a/test/routines/level3/xsymm.hpp
+++ b/test/routines/level3/xsymm.hpp
@@ -92,7 +92,7 @@ class TestXsymm {
buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
buffers.c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level3/xsyr2k.hpp b/test/routines/level3/xsyr2k.hpp
index 4e4ba0b7..5c4976e2 100644
--- a/test/routines/level3/xsyr2k.hpp
+++ b/test/routines/level3/xsyr2k.hpp
@@ -90,7 +90,7 @@ class TestXsyr2k {
buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
buffers.c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level3/xsyrk.hpp b/test/routines/level3/xsyrk.hpp
index f5509c88..98c4f6a4 100644
--- a/test/routines/level3/xsyrk.hpp
+++ b/test/routines/level3/xsyrk.hpp
@@ -82,7 +82,7 @@ class TestXsyrk {
buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
buffers.c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/level3/xtrmm.hpp b/test/routines/level3/xtrmm.hpp
index 45e17e45..55b51e54 100644
--- a/test/routines/level3/xtrmm.hpp
+++ b/test/routines/level3/xtrmm.hpp
@@ -82,7 +82,7 @@ class TestXtrmm {
buffers.a_mat(), args.a_offset, args.a_ld,
buffers.b_mat(), args.b_offset, args.b_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp
index 4637c07e..dccb3583 100644
--- a/test/routines/levelx/xomatcopy.hpp
+++ b/test/routines/levelx/xomatcopy.hpp
@@ -77,7 +77,7 @@ class TestXomatcopy {
buffers.a_mat(), args.a_offset, args.a_ld,
buffers.b_mat(), args.b_offset, args.b_ld,
&queue_plain, &event);
- clWaitForEvents(1, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}