diff options
113 files changed, 1878 insertions, 244 deletions
@@ -1,4 +1,14 @@ +Development version (next release) +- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header +- Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation +- Fixed a bug in the tests and samples related to waiting for an invalid event +- Added support for compilation under Visual Studio 2013 (MSVC++ 12.0) +- Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS +- Added an option to run tuned kernels multiple times to average execution times +- Various minor fixes and enhancements +- Added tuned parameters for various devices (see README) + Version 0.9.0 - Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header - Improved performance significantly of rotated GEMV computations diff --git a/CMakeLists.txt b/CMakeLists.txt index b422743f..9b028448 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,7 +135,8 @@ endif() # ================================================================================================== # Sets the supported routines and the used kernels. New routines and kernels should be added here. -set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemv) +set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger + xgemm xgemm_direct xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) @@ -278,8 +279,9 @@ if(CLIENTS OR TESTS) set(REF_INCLUDES ) set(REF_LIBRARIES ) if(CLBLAS_FOUND) + find_package(Threads) + set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS}) - set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES}) if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") add_definitions(" /DCLBLAST_REF_CLBLAS") else() @@ -362,7 +364,7 @@ if(TESTS) test/correctness/tester.cpp test/correctness/testblas.cpp) target_include_directories(test_correctness_common PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> - ${clblast_SOURCE_DIR}) + ${clblast_SOURCE_DIR} ${REF_INCLUDES}) set(TESTS_COMMON ${TESTS_COMMON} $<TARGET_OBJECTS:test_correctness_common>) endif() @@ -1,14 +1,21 @@ +MIT License -Copyright (c) 2015 Cedric Nugteren +Copyright (c) 2016 Cedric Nugteren -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: - http://www.apache.org/licenses/LICENSE-2.0 +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. @@ -48,7 +48,7 @@ The pre-requisites for compilation of CLBlast are: - Clang 3.3 or newer - AppleClang 5.0 or newer - ICC 14.0 or newer - - MSVC (Visual Studio) 2015 or newer + - MSVC (Visual Studio) 2013 or newer * An OpenCL 1.1 or newer library, for example: - Apple OpenCL - NVIDIA CUDA SDK @@ -90,6 +90,8 @@ Afterwards, any of CLBlast's routines can be called directly: there is no need t cmake -DSAMPLES=ON .. +Furthermore, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler. + Using the tuners (optional) ------------- @@ -117,8 +119,9 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC - Tahiti * Intel GPUs: - HD Graphics 530 - - HD Graphics Haswell Ultrabook GT2 Mobile - HD Graphics 5500 BroadWell U-Processor GT2 + - HD Graphics Haswell Ultrabook GT2 Mobile + - HD Graphics IvyBridge M GT2 - HD Graphics Skylake ULT GT2 - Iris - Iris Pro @@ -134,7 +137,7 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s cmake -DTUNERS=ON .. -Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.3.1 or higher). +Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.5.0 or higher). Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake. diff --git a/doc/performance/GeForce_GTX480/SAXPY.pdf b/doc/performance/GeForce_GTX480/SAXPY.pdf Binary files differdeleted file mode 100644 index 6e1c8f5a..00000000 --- a/doc/performance/GeForce_GTX480/SAXPY.pdf +++ /dev/null diff --git a/doc/performance/GeForce_GTX480/SGEMM.pdf b/doc/performance/GeForce_GTX480/SGEMM.pdf Binary files differdeleted file mode 100644 index f430f880..00000000 --- a/doc/performance/GeForce_GTX480/SGEMM.pdf +++ /dev/null diff --git a/doc/performance/GeForce_GTX480/SGEMV.pdf b/doc/performance/GeForce_GTX480/SGEMV.pdf Binary files differdeleted file mode 100644 index 8cb57124..00000000 --- a/doc/performance/GeForce_GTX480/SGEMV.pdf +++ /dev/null diff --git a/doc/performance/GeForce_GTX480/SSYMM.pdf b/doc/performance/GeForce_GTX480/SSYMM.pdf Binary files differdeleted file mode 100644 index ff5941ad..00000000 --- a/doc/performance/GeForce_GTX480/SSYMM.pdf +++ /dev/null diff --git a/doc/performance/GeForce_GTX750Ti/SAXPY.pdf b/doc/performance/GeForce_GTX750Ti/SAXPY.pdf Binary files differnew file mode 100644 index 00000000..531baa79 --- /dev/null +++ b/doc/performance/GeForce_GTX750Ti/SAXPY.pdf diff --git a/doc/performance/GeForce_GTX750Ti/SGEMM.pdf b/doc/performance/GeForce_GTX750Ti/SGEMM.pdf Binary files differnew file mode 100644 index 00000000..dcd62929 --- /dev/null +++ b/doc/performance/GeForce_GTX750Ti/SGEMM.pdf diff --git a/doc/performance/GeForce_GTX750Ti/SGEMV.pdf b/doc/performance/GeForce_GTX750Ti/SGEMV.pdf Binary files differnew file mode 100644 index 00000000..a4c3efb3 --- /dev/null +++ b/doc/performance/GeForce_GTX750Ti/SGEMV.pdf diff --git a/doc/performance/GeForce_GTX750Ti/SSYMM.pdf b/doc/performance/GeForce_GTX750Ti/SSYMM.pdf Binary files differnew file mode 100644 index 00000000..43d97d24 --- /dev/null +++ b/doc/performance/GeForce_GTX750Ti/SSYMM.pdf diff --git a/doc/performance/Intel_IrisPro/SAXPY.pdf b/doc/performance/Intel_IrisPro/SAXPY.pdf Binary files differindex 3a51f306..8d639b24 100644 --- a/doc/performance/Intel_IrisPro/SAXPY.pdf +++ b/doc/performance/Intel_IrisPro/SAXPY.pdf diff --git a/doc/performance/Intel_IrisPro/SGEMM.pdf b/doc/performance/Intel_IrisPro/SGEMM.pdf Binary files differindex 15f1714f..31725025 100644 --- a/doc/performance/Intel_IrisPro/SGEMM.pdf +++ b/doc/performance/Intel_IrisPro/SGEMM.pdf diff --git a/doc/performance/Intel_IrisPro/SGEMV.pdf b/doc/performance/Intel_IrisPro/SGEMV.pdf Binary files differindex e1660999..9ec120c4 100644 --- a/doc/performance/Intel_IrisPro/SGEMV.pdf +++ b/doc/performance/Intel_IrisPro/SGEMV.pdf diff --git a/doc/performance/Radeon_M370X/SGEMM.pdf b/doc/performance/Radeon_M370X/SGEMM.pdf Binary files differindex 5dca8f03..da5722f9 100644 --- a/doc/performance/Radeon_M370X/SGEMM.pdf +++ b/doc/performance/Radeon_M370X/SGEMM.pdf diff --git a/doc/performance/Radeon_M370X/SGEMV.pdf b/doc/performance/Radeon_M370X/SGEMV.pdf Binary files differindex fa661249..513318bf 100644 --- a/doc/performance/Radeon_M370X/SGEMV.pdf +++ b/doc/performance/Radeon_M370X/SGEMV.pdf diff --git a/doc/performance/Radeon_M370X/SSYMM.pdf b/doc/performance/Radeon_M370X/SSYMM.pdf Binary files differindex 852181d1..03efd198 100644 --- a/doc/performance/Radeon_M370X/SSYMM.pdf +++ b/doc/performance/Radeon_M370X/SSYMM.pdf diff --git a/include/clblast_half.h b/include/clblast_half.h index 269a520e..05d96f9f 100644 --- a/include/clblast_half.h +++ b/include/clblast_half.h @@ -25,6 +25,11 @@ #include <CL/opencl.h> #endif +// MSVC 2013 doesn't fully support C99 +#ifdef _MSC_VER + #define inline __inline +#endif + // ================================================================================================= // Host data-type for half-precision floating-point (16-bit). This is based on the OpenCL type, diff --git a/samples/cache.c b/samples/cache.c index a592824d..abc8ad4b 100644 --- a/samples/cache.c +++ b/samples/cache.c @@ -112,8 +112,10 @@ void run_example_routine(const cl_device_id device) { &queue, &event); // Wait for completion - clWaitForEvents(1, &event); - clReleaseEvent(event); + if (status == kSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } // Retrieves the execution time clock_t diff = clock() - start; diff --git a/samples/dgemv.c b/samples/dgemv.c index c22c9f37..a15d649a 100644 --- a/samples/dgemv.c +++ b/samples/dgemv.c @@ -84,8 +84,10 @@ int main(void) { &queue, &event); // Wait for completion - clWaitForEvents(1, &event); - clReleaseEvent(event); + if (status == kSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } // Example completed. See "clblast_c.h" for status codes (0 -> success). printf("Completed DGEMV with status %d\n", status); diff --git a/samples/haxpy.c b/samples/haxpy.c index d5b98e12..5bab3d42 100644 --- a/samples/haxpy.c +++ b/samples/haxpy.c @@ -77,8 +77,10 @@ int main(void) { &queue, &event); // Wait for completion - clWaitForEvents(1, &event); - clReleaseEvent(event); + if (status == kSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } // Copies the result back to the host clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL); diff --git a/samples/sasum.c b/samples/sasum.c index 1518cc13..02f924b0 100644 --- a/samples/sasum.c +++ b/samples/sasum.c @@ -73,8 +73,10 @@ int main(void) { &queue, &event); // Wait for completion - clWaitForEvents(1, &event); - clReleaseEvent(event); + if (status == kSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } // Copies the result back to the host clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL); diff --git a/samples/sgemm.c b/samples/sgemm.c index b4827777..583fc261 100644 --- a/samples/sgemm.c +++ b/samples/sgemm.c @@ -87,8 +87,10 @@ int main(void) { &queue, &event); // Wait for completion - clWaitForEvents(1, &event); - clReleaseEvent(event); + if (status == kSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } // Example completed. See "clblast_c.h" for status codes (0 -> success). printf("Completed SGEMM with status %d\n", status); diff --git a/samples/sgemm.cpp b/samples/sgemm.cpp index a4b89968..401ecff8 100644 --- a/samples/sgemm.cpp +++ b/samples/sgemm.cpp @@ -95,8 +95,10 @@ int main() { &queue_plain, &event); // Record the execution time - clWaitForEvents(1, &event); - clReleaseEvent(event); + if (status == clblast::StatusCode::kSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } auto elapsed_time = std::chrono::steady_clock::now() - start_time; auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count(); diff --git a/scripts/database/database.py b/scripts/database/database.py index f758a2b7..31f313da 100755 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -18,7 +18,7 @@ import database.bests as bests import database.defaults as defaults # Server storing a copy of the database -DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.json" +DATABASE_SERVER_URL = "https://raw.githubusercontent.com/CNugteren/CLBlast-database/master/database.json" # OpenCL vendor names and their short name VENDOR_TRANSLATION_TABLE = { diff --git a/scripts/graphs/common.r b/scripts/graphs/common.r index cd68cf26..e5dad616 100644 --- a/scripts/graphs/common.r +++ b/scripts/graphs/common.r @@ -31,8 +31,12 @@ options("width"=170) # ================================================================================================== -# Constants -num_runs <- 4 +# Settings +num_runs <- 5 +num_runs_short <- 50 +xtics_subset_threshold <- 100 +xtics_subset_stepsize <- 8 + devices <- c("-platform","-device") options_string <- "-q -no_abbrv -cblas 0" library_names <- c("CLBlast", "clBLAS") @@ -66,11 +70,21 @@ main <- function(routine_name, precision, test_names, test_values, executable <- paste("./clblast_client_", routine_name, sep="") # Configures the outputfile - pdf(paste(display_name, ".pdf", sep=""), height=8, width=13) - par(mfrow=c(2, 3)) - par(oma=c(0, 0, 0, 0)) - par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)] - par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)] + file_name <- paste(display_name, ".pdf", sep="") + if (length(test_names) == 6) { + pdf(file_name, height=8, width=13) + par(mfrow=c(2, 3)) + par(oma=c(0, 0, 0, 0)) + par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)] + par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)] + } + else { # length(test_names) == 2 + pdf(file_name, height=8, width=13) + par(mfrow=c(2, 1)) + par(oma=c(0, 0, 0, 0)) + par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)] + par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)] + } # Loops over the test-cases for (test_id in 1:length(test_names)) { @@ -169,7 +183,12 @@ plot_graph <- function(xdata, ydata, log_setting, main="", xlab="", ylab="", ylim=c(ymin, ymax), xlim=c(xmin, xmax), axes=F, "n") axis(side=2, las=2) - axis(side=1, at=xdata, labels=xtics, las=2) + if (length(xdata) > xtics_subset_threshold) { # Too many indices to print, plot only every Nth + subset <- seq(from=1, to=length(xdata), by=xtics_subset_stepsize) + axis(side=1, at=xdata[subset], labels=xtics[subset], las=2) + } else { + axis(side=1, at=xdata, labels=xtics, las=2) + } title(xlab=xlabel, line=-1) title(ylab=ylabel, line=2) title(graph_title, line=-2) diff --git a/scripts/graphs/xgemm_small.r b/scripts/graphs/xgemm_small.r new file mode 100644 index 00000000..ef94ef20 --- /dev/null +++ b/scripts/graphs/xgemm_small.r @@ -0,0 +1,56 @@ + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +# project uses a tab-size of two spaces and a max-width of 100 characters per line. +# +# Author(s): +# Cedric Nugteren <www.cedricnugteren.nl> +# +# This file implements the performance script for small sizes of Xgemm, testing the direct kernel +# +# ================================================================================================== + +# Includes the common functions +args <- commandArgs(trailingOnly = FALSE) +thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) +source(file.path(dirname(thisfile), "common.r")) + +# ================================================================================================== + +# Settings +routine_name <- "xgemm" +parameters <- c("-m","-n","-k","-layout","-transA","-transB", + "-num_steps","-step","-runs","-precision") +precision <- 32 + +# Sets the names of the test-cases +test_names <- list( + "small matrices in steps of 16", + "small matrices in steps of 1" +) + +# Defines the test-cases +test_values <- list( + list(c( 128, 128, 128, 102, 111, 111, 57, 16, num_runs_short, precision)), + list(c( 128, 128, 128, 102, 111, 111, 385, 1, num_runs_short, precision)) +) + +# Defines the x-labels corresponding to the test-cases +test_xlabels <- list( + "matrix sizes (m=n=k)", + "matrix sizes (m=n=k)" +) + +# Defines the x-axis of the test-cases +test_xaxis <- list( + c("m", ""), + c("m", "") +) + +# ================================================================================================== + +# Start the script +main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, + test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE) + +# ==================================================================================================
\ No newline at end of file diff --git a/scripts/graphs/xsymm.r b/scripts/graphs/xsymm.r index a65bb16f..89d137d2 100644 --- a/scripts/graphs/xsymm.r +++ b/scripts/graphs/xsymm.r @@ -35,32 +35,32 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)), - list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)), - list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)), - list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)), + list(c( 128, 128, 102, 141, 121, 16, 128, num_runs, precision)), + list(c( 129, 129, 102, 141, 121, 16, 128, num_runs, precision)), + list(c( 512, 512, 102, 141, 121, 16, 1, num_runs, precision)), + list(c(2048, 2048, 102, 141, 121, 16, 1, num_runs, precision)), list( - c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision), - c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision), - c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision), - c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision), - c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), - c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision), - c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision), - c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision) + c(1024, 1024, 101, 141, 121, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 122, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 121, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 122, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 122, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 121, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 122, 1, 0, num_runs, precision) ), list( - c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision), - c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision), - c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision), - c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision), - c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision), - c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision), - c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision), - c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), - c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision), - c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision), - c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision) + c( 8, 8, 102, 141, 121, 1, 0, num_runs, precision), + c( 16, 16, 102, 141, 121, 1, 0, num_runs, precision), + c( 32, 32, 102, 141, 121, 1, 0, num_runs, precision), + c( 64, 64, 102, 141, 121, 1, 0, num_runs, precision), + c( 128, 128, 102, 141, 121, 1, 0, num_runs, precision), + c( 256, 256, 102, 141, 121, 1, 0, num_runs, precision), + c( 512, 512, 102, 141, 121, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision), + c(2048, 2048, 102, 141, 121, 1, 0, num_runs, precision), + c(4096, 4096, 102, 141, 121, 1, 0, num_runs, precision), + c(8192, 8192, 102, 141, 121, 1, 0, num_runs, precision) ) ) diff --git a/scripts/graphs/xsyrk.r b/scripts/graphs/xsyrk.r index 4ab46c9f..754c93e2 100644 --- a/scripts/graphs/xsyrk.r +++ b/scripts/graphs/xsyrk.r @@ -35,32 +35,32 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)), - list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)), - list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)), - list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)), + list(c( 128, 128, 102, 121, 111, 16, 128, num_runs, precision)), + list(c( 129, 129, 102, 121, 111, 16, 128, num_runs, precision)), + list(c( 512, 512, 102, 121, 111, 16, 1, num_runs, precision)), + list(c(2048, 2048, 102, 121, 111, 16, 1, num_runs, precision)), list( - c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision), - c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision), - c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision), - c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision), - c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), - c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision), - c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision), - c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision) + c(1024, 1024, 101, 121, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 121, 112, 1, 0, num_runs, precision), + c(1024, 1024, 101, 122, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 122, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 121, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 122, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 122, 112, 1, 0, num_runs, precision) ), list( - c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision), - c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision), - c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision), - c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision), - c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision), - c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision), - c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision), - c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), - c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision), - c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision), - c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision) + c( 8, 8, 102, 121, 111, 1, 0, num_runs, precision), + c( 16, 16, 102, 121, 111, 1, 0, num_runs, precision), + c( 32, 32, 102, 121, 111, 1, 0, num_runs, precision), + c( 64, 64, 102, 121, 111, 1, 0, num_runs, precision), + c( 128, 128, 102, 121, 111, 1, 0, num_runs, precision), + c( 256, 256, 102, 121, 111, 1, 0, num_runs, precision), + c( 512, 512, 102, 121, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision), + c(2048, 2048, 102, 121, 111, 1, 0, num_runs, precision), + c(4096, 4096, 102, 121, 111, 1, 0, num_runs, precision), + c(8192, 8192, 102, 121, 111, 1, 0, num_runs, precision) ) ) diff --git a/src/clpp11.hpp b/src/clpp11.hpp index d57223dd..aaa76cb4 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -12,8 +12,8 @@ // Portability here means that a similar header exists for CUDA with the same classes and // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change. // -// This file is taken from the Claduc project <https://github.com/CNugteren/Claduc> and therefore -// contains the following header copyright notice: +// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and +// therefore contains the following header copyright notice: // // ================================================================================================= // @@ -97,14 +97,12 @@ class Event { // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx float GetElapsedTime() const { WaitForCompletion(); - auto bytes = size_t{0}; - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); - auto time_start = size_t{0}; + const auto bytes = sizeof(cl_ulong); + auto time_start = cl_ulong{0}; clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); - auto time_end = size_t{0}; + auto time_end = cl_ulong{0}; clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); - return (time_end - time_start) * 1.0e-6f; + return static_cast<float>(time_end - time_start) * 1.0e-6f; } // Accessor to the private data-member @@ -152,6 +150,17 @@ class Platform { cl_platform_id platform_; }; +// Retrieves a vector with all platforms +inline std::vector<Platform> GetAllPlatforms() { + auto num_platforms = cl_uint{0}; + CheckError(clGetPlatformIDs(0, nullptr, &num_platforms)); + auto all_platforms = std::vector<Platform>(); + for (size_t platform_id = 0; platform_id < static_cast<size_t>(num_platforms); ++platform_id) { + all_platforms.push_back(Platform(platform_id)); + } + return all_platforms; +} + // ================================================================================================= // C++11 version of 'cl_device_id' @@ -201,8 +210,8 @@ class Device { std::vector<size_t> MaxWorkItemSizes() const { return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES); } - cl_ulong LocalMemSize() const { - return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE); + unsigned long LocalMemSize() const { + return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE)); } std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); } size_t CoreClock() const { @@ -238,9 +247,11 @@ class Device { // Query for a specific type of device or brand bool IsCPU() const { return Type() == "CPU"; } bool IsGPU() const { return Type() == "GPU"; } - bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; } + bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." || + Vendor() == "AuthenticAMD";; } bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; } - bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; } + bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" || + Vendor() == "GenuineIntel"; } bool IsARM() const { return Vendor() == "ARM"; } // Accessor to the private data-member @@ -606,8 +617,7 @@ class Buffer { // Retrieves the actual allocated size in bytes size_t GetSize() const { - auto bytes = size_t{0}; - CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes)); + const auto bytes = sizeof(size_t); auto result = size_t{0}; CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr)); return result; @@ -658,17 +668,16 @@ class Kernel { } // Retrieves the amount of local memory used per work-group for this kernel - cl_ulong LocalMemUsage(const Device &device) const { - auto bytes = size_t{0}; + unsigned long LocalMemUsage(const Device &device) const { + const auto bytes = sizeof(cl_ulong); auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE}; - CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes)); auto result = cl_ulong{0}; CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr)); - return result; + return static_cast<unsigned long>(result); } // Retrieves the name of the kernel - std::string GetFunctionName() { + std::string GetFunctionName() const { auto bytes = size_t{0}; CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes)); auto result = std::string{}; @@ -689,6 +698,7 @@ class Kernel { void Launch(const Queue &queue, const std::vector<size_t> &global, const std::vector<size_t> &local, EventPointer event, const std::vector<Event> &waitForEvents) { + // Builds a plain version of the events waiting list auto waitForEventsPlain = std::vector<cl_event>(); for (auto &waitEvent : waitForEvents) { diff --git a/src/database/database.cpp b/src/database/database.cpp index 34c44a29..1198cefb 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -21,10 +21,12 @@ #include "database/kernels/xgemv_fast_rot.hpp" #include "database/kernels/xger.hpp" #include "database/kernels/xgemm.hpp" +#include "database/kernels/xgemm_direct.hpp" #include "database/kernels/copy.hpp" #include "database/kernels/pad.hpp" #include "database/kernels/transpose.hpp" #include "database/kernels/padtranspose.hpp" +#include "database/kernel_selection.hpp" namespace clblast { // ================================================================================================= @@ -38,10 +40,29 @@ const std::vector<Database::DatabaseEntry> Database::database = { XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble, XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble, XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, + XgemmDirectHalf, XgemmDirectSingle, XgemmDirectDouble, XgemmDirectComplexSingle, XgemmDirectComplexDouble, CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble, TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble, - PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble + PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble, + KernelSelectionHalf, KernelSelectionSingle, KernelSelectionDouble, KernelSelectionComplexSingle, KernelSelectionComplexDouble +}; + +// The OpenCL device types +const std::string Database::kDeviceTypeCPU = "CPU"; +const std::string Database::kDeviceTypeGPU = "GPU"; +const std::string Database::kDeviceTypeAccelerator = "accelerator"; +const std::string Database::kDeviceTypeAll = "default"; + +// The OpenCL device vendors +const std::string Database::kDeviceVendorAll = "default"; + +// Alternative names for some OpenCL vendors +const std::unordered_map<std::string, std::string> Database::kVendorNames{ + { "Intel(R) Corporation", "Intel" }, + { "GenuineIntel", "Intel" }, + { "Advanced Micro Devices, Inc.", "AMD" }, + { "NVIDIA Corporation", "NVIDIA" }, }; // ================================================================================================= diff --git a/src/database/database.hpp b/src/database/database.hpp index a6ab49c5..346fe089 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -36,36 +36,31 @@ class Database { // Structures for content inside the database struct DatabaseDevice { - const std::string name; - const Parameters parameters; + std::string name; + Parameters parameters; }; struct DatabaseVendor { - const std::string type; - const std::string name; - const std::vector<DatabaseDevice> devices; + std::string type; + std::string name; + std::vector<DatabaseDevice> devices; }; struct DatabaseEntry { - const std::string kernel; - const Precision precision; - const std::vector<DatabaseVendor> vendors; + std::string kernel; + Precision precision; + std::vector<DatabaseVendor> vendors; }; // The OpenCL device types - static constexpr auto kDeviceTypeCPU = "CPU"; - static constexpr auto kDeviceTypeGPU = "GPU"; - static constexpr auto kDeviceTypeAccelerator = "accelerator"; - static constexpr auto kDeviceTypeAll = "default"; + static const std::string kDeviceTypeCPU; + static const std::string kDeviceTypeGPU; + static const std::string kDeviceTypeAccelerator; + static const std::string kDeviceTypeAll; // The OpenCL device vendors - static constexpr auto kDeviceVendorAll = "default"; + static const std::string kDeviceVendorAll; // Alternative names for some OpenCL vendors - const std::unordered_map<std::string,std::string> kVendorNames { - {"Intel(R) Corporation", "Intel"}, - {"GenuineIntel", "Intel"}, - {"Advanced Micro Devices, Inc.", "AMD"}, - {"NVIDIA Corporation", "NVIDIA"}, - }; + static const std::unordered_map<std::string, std::string> kVendorNames; // The database consists of separate database entries, stored together in a vector static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble; @@ -75,10 +70,12 @@ class Database { static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble; static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble; static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; + static const DatabaseEntry XgemmDirectHalf, XgemmDirectSingle, XgemmDirectDouble, XgemmDirectComplexSingle, XgemmDirectComplexDouble; static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble; static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble; static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble; static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble; + static const DatabaseEntry KernelSelectionHalf, KernelSelectionSingle, KernelSelectionDouble, KernelSelectionComplexSingle, KernelSelectionComplexDouble; static const std::vector<DatabaseEntry> database; // The constructor with a user-provided database overlay (potentially an empty vector) diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp new file mode 100644 index 00000000..c9462c7a --- /dev/null +++ b/src/database/kernel_selection.hpp @@ -0,0 +1,129 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel +// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the +// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use +// some common default values. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::KernelSelectionHalf = { + "KernelSelection", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::KernelSelectionSingle = { + "KernelSelection", Precision::kSingle, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::KernelSelectionComplexSingle = { + "KernelSelection", Precision::kComplexSingle, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::KernelSelectionDouble = { + "KernelSelection", Precision::kDouble, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::KernelSelectionComplexDouble = { + "KernelSelection", Precision::kComplexDouble, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index a6b7dfe8..479c7f78 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::CopySingle = { { "Intel(R) HD Graphics 530", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, @@ -84,7 +85,7 @@ const Database::DatabaseEntry Database::CopySingle = { { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, @@ -128,6 +129,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { "Intel(R) HD Graphics 530", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } }, @@ -147,7 +149,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, - { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, @@ -205,13 +207,13 @@ const Database::DatabaseEntry Database::CopyDouble = { { "GeForce GTX 670", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 750", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, { // Default @@ -264,7 +266,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp index 3cfabaf4..48085139 100644 --- a/src/database/kernels/pad.hpp +++ b/src/database/kernels/pad.hpp @@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::PadSingle = { { "Intel(R) HD Graphics 530", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, @@ -84,7 +85,7 @@ const Database::DatabaseEntry Database::PadSingle = { { "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, - { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, @@ -134,10 +135,11 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { "Intel(R) HD Graphics 530", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // Intel accelerators @@ -154,13 +156,13 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // Default @@ -272,7 +274,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = { { "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 750", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp index 88bd4ea7..f9448985 100644 --- a/src/database/kernels/padtranspose.hpp +++ b/src/database/kernels/padtranspose.hpp @@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { { "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, @@ -134,6 +135,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { { "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, @@ -272,7 +274,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp index 0e1b608e..191d2e98 100644 --- a/src/database/kernels/transpose.hpp +++ b/src/database/kernels/transpose.hpp @@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::TransposeSingle = { { "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, @@ -134,6 +135,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { { "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, @@ -159,7 +161,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, } @@ -207,7 +209,7 @@ const Database::DatabaseEntry Database::TransposeDouble = { { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index 9c1bcd99..70be5abc 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { { "Intel(R) HD Graphics 530", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } }, { "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, @@ -84,7 +85,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { { "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 750", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, + { "GeForce GTX 750 Ti", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -134,6 +135,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { "Intel(R) HD Graphics 530", { {"VW",4}, {"WGS",64}, {"WPT",2} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, { "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } }, @@ -213,7 +215,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = { { "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 750", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, @@ -272,7 +274,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { { "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 750", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, - { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } }, + { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp index 987a990d..96a699aa 100644 --- a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -55,6 +55,7 @@ const Database::DatabaseEntry Database::XdotSingle = { { "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WGS2",32} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",512}, {"WGS2",128} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } }, { "Iris Pro", { {"WGS1",512}, {"WGS2",64} } }, { "default", { {"WGS1",64}, {"WGS2",32} } }, @@ -68,6 +69,7 @@ const Database::DatabaseEntry Database::XdotSingle = { { "GeForce GTX 670", { {"WGS1",512}, {"WGS2",1024} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } }, { "GeForce GTX 750", { {"WGS1",128}, {"WGS2",32} } }, + { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } }, @@ -106,6 +108,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WGS2",32} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",512}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } }, { "Iris Pro", { {"WGS1",32}, {"WGS2",32} } }, { "default", { {"WGS1",32}, {"WGS2",32} } }, @@ -119,6 +122,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, { "GeForce GTX 750", { {"WGS1",64}, {"WGS2",32} } }, + { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, @@ -160,6 +164,7 @@ const Database::DatabaseEntry Database::XdotDouble = { { "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, { "GeForce GTX 750", { {"WGS1",64}, {"WGS2",256} } }, + { "GeForce GTX 750 Ti", { {"WGS1",128}, {"WGS2",64} } }, { "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, @@ -201,6 +206,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { { "GeForce GTX 670", { {"WGS1",512}, {"WGS2",128} } }, { "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } }, { "GeForce GTX 750", { {"WGS1",256}, {"WGS2",32} } }, + { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } }, diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index d19c55b5..ffe3dc57 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -57,9 +57,10 @@ const Database::DatabaseEntry Database::XgemmSingle = { { "Intel(R) HD Graphics 530", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, { "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, - { "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + { "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, @@ -77,7 +78,7 @@ const Database::DatabaseEntry Database::XgemmSingle = { { "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, { "GeForce GTX 750", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } }, - { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } }, + { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",2} } }, { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } }, { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } }, @@ -127,6 +128,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { { "Intel(R) HD Graphics 530", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, { "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, @@ -147,7 +149,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { { "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX 750", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, - { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, @@ -206,7 +208,7 @@ const Database::DatabaseEntry Database::XgemmDouble = { { "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX 750", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, - { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, + { "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, @@ -265,7 +267,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = { { "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 750", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, - { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp new file mode 100644 index 00000000..53dfeaa6 --- /dev/null +++ b/src/database/kernels/xgemm_direct.hpp @@ -0,0 +1,136 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator <database.py> +// +// This file populates the database with best-found tuning parameters for the 'Xgemm_Direct' kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmDirectHalf = { + "XgemmDirect", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmDirectSingle = { + "XgemmDirect", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Iris Pro", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmDirectComplexSingle = { + "XgemmDirect", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Iris Pro", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 750 Ti", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmDirectDouble = { + "XgemmDirect", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmDirectComplexDouble = { + "XgemmDirect", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index e5e8845e..be6606a6 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -57,6 +57,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { { "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WPT1",1} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WPT1",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Iris", { {"WGS1",64}, {"WPT1",2} } }, { "Iris Pro", { {"WGS1",256}, {"WPT1",2} } }, @@ -77,7 +78,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX 750", { {"WGS1",256}, {"WPT1",1} } }, - { "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } }, { "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1} } }, { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } }, @@ -120,6 +121,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = { { "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WPT1",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Iris", { {"WGS1",256}, {"WPT1",1} } }, { "Iris Pro", { {"WGS1",64}, {"WPT1",1} } }, @@ -140,7 +142,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = { { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 750", { {"WGS1",128}, {"WPT1",1} } }, - { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } }, { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, { "default", { {"WGS1",64}, {"WPT1",1} } }, } @@ -188,7 +190,7 @@ const Database::DatabaseEntry Database::XgemvDouble = { { "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1} } }, { "GeForce GTX 750", { {"WGS1",64}, {"WPT1",1} } }, - { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } }, { "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, { "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } }, diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp index 52af628c..cd430dcb 100644 --- a/src/database/kernels/xgemv_fast.hpp +++ b/src/database/kernels/xgemv_fast.hpp @@ -57,10 +57,11 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { { "Intel(R) HD Graphics 530", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, { "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, - { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } }, } }, { // Intel accelerators @@ -77,7 +78,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { { "GeForce GTX 670", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, { "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "GeForce GTX 750", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, - { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 750 Ti", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } }, { "GeForce GTX 980", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, @@ -120,6 +121,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = { { "Intel(R) HD Graphics 530", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Iris Pro", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, @@ -139,7 +141,6 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = { { "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "GeForce GTX 670", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "GeForce GTX 680", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, - { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, } }, @@ -186,7 +187,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = { { "GeForce GTX 670", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "GeForce GTX 750", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, - { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } }, { "GeForce GTX 980", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp index 328094e1..2dd7db32 100644 --- a/src/database/kernels/xgemv_fast_rot.hpp +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -44,6 +44,7 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = { kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } }, { "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, @@ -51,8 +52,9 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = { }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 750 Ti", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, - { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, } }, { // Default @@ -83,6 +85,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = { kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, { "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, { "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } }, @@ -114,8 +117,9 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = { }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, - { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, } }, { // Default diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp index 3e9c25c1..7816080f 100644 --- a/src/database/kernels/xger.hpp +++ b/src/database/kernels/xger.hpp @@ -63,6 +63,7 @@ const Database::DatabaseEntry Database::XgerSingle = { { "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } }, { "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } }, { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, @@ -76,8 +77,9 @@ const Database::DatabaseEntry Database::XgerSingle = { { "GeForce GTX 670", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, { "GeForce GTX 750", { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } }, + { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, + { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, } }, { // Default @@ -120,6 +122,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { { "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } }, + { "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, { "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } }, { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, @@ -133,6 +136,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { { "GeForce GTX 670", { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX 750", { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } }, + { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, } @@ -180,8 +184,9 @@ const Database::DatabaseEntry Database::XgerDouble = { { "GeForce GTX 670", { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX 750", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, + { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, - { "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, + { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, } }, { // Default @@ -227,6 +232,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { { "GeForce GTX 670", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } }, { "GeForce GTX 750", { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } }, + { "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, } diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 223501fd..b0817242 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -204,7 +204,7 @@ R"( #if PRECISION == 3232 || PRECISION == 6464 #define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y #else - #define COMPLEX_CONJUGATE(value) value = value + #define COMPLEX_CONJUGATE(value) #endif // ================================================================================================= diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl new file mode 100644 index 00000000..a8bd450e --- /dev/null +++ b/src/kernels/level3/xgemm_direct_part1.opencl @@ -0,0 +1,273 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any +// pre and and post-processing kernels. +// +// This kernel is seperated into three files. This is part 1 out of 3. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. Note that all parameters here have a +// suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel. +#ifndef WGD + #define WGD 8 // Tile-size in dimension M, N, and K (e.g. 8, 16, 32, 64) +#endif +#ifndef MDIMCD + #define MDIMCD 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32) +#endif +#ifndef NDIMCD + #define NDIMCD 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32) +#endif +#ifndef MDIMAD + #define MDIMAD 8 // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD +#endif +#ifndef NDIMBD + #define NDIMBD 8 // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD +#endif +#ifndef KWID + #define KWID 1 // Unroll factor of the WGD loop (smaller or equal than WGD) +#endif +#ifndef VWMD + #define VWMD 1 // Vector width of matrices A and C +#endif +#ifndef VWND + #define VWND 1 // Vector width of matrix B +#endif +#ifndef PADA + #define PADA 1 // Local memory padding for matrix A +#endif +#ifndef PADB + #define PADB 1 // Local memory padding for matrix B +#endif + +// Helper parameters based on the above tuning parameters +#define MWID (WGD/MDIMCD) // Work per work-item (M-dimension) +#define NWID (WGD/NDIMCD) // Work per work-item (N-dimension) +#define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD)) // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD +#define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD)) // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD +#define MWAD (WGD/MDIMAD) // Amount of loads-per-thread for matrix A (M-dimension) +#define KWAD (WGD/KDIMAD) // Amount of loads-per-thread for matrix A (K-dimension) +#define KWBD (WGD/KDIMBD) // Amount of loads-per-thread for matrix B (K-dimension) +#define NWBD (WGD/NDIMBD) // Amount of loads-per-thread for matrix B (N-dimension) + +// ================================================================================================= + +// Data-widths in dimension M +#if VWMD == 1 + typedef real realMD; +#elif VWMD == 2 + typedef real2 realMD; +#elif VWMD == 4 + typedef real4 realMD; +#elif VWMD == 8 + typedef real8 realMD; +#elif VWMD == 16 + typedef real16 realMD; +#endif + +// Data-widths in dimension N +#if VWND == 1 + typedef real realND; +#elif VWND == 2 + typedef real2 realND; +#elif VWND == 4 + typedef real4 realND; +#elif VWND == 8 + typedef real8 realND; +#elif VWND == 16 + typedef real16 realND; +#endif + +// ================================================================================================= + +// Initializes the accumulation registers to zero +inline void InitAccRegistersDirect(real cpm[NWID][MWID]) { + #pragma unroll + for (int mi=0; mi<MWID; ++mi) { + #pragma unroll + for (int ni=0; ni<NWID; ++ni) { + SetToZero(cpm[ni][mi]); + } + } +} + +// ================================================================================================= + +// Performs the actual computation: Cpm += Apm * Bpm +inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) { + #pragma unroll + for (int ni=0; ni<NWID; ++ni) { + #pragma unroll + for (int mi=0; mi<MWID; ++mi) { + MultiplyAdd(cpm[ni][mi], apm[mi], bpm[ni]); + } + } +} + +// ================================================================================================= + +// Loads global off-chip memory into thread-private register files. This function is specific for +// loading the A input matrix. +inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID], + const int a_ld, const int a_offset, const int idm, const int idk, + const int a_transpose, const int a_conjugate) { + #pragma unroll + for (int mi=0; mi<MWID; ++mi) { + const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi); + apm[mi] = agms[a_index + a_offset]; + if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); } + } +} + +// Same as above, but now for the B input matrix +inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID], + const int b_ld, const int b_offset, const int idn, const int idk, + const int b_transpose, const int b_conjugate) { + #pragma unroll + for (int ni=0; ni<NWID; ++ni) { + const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni); + bpm[ni] = bgms[b_index + b_offset]; + if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); } + } +} + +// Loads global off-chip memory into thread-private register files. This function is specific for +// loading the A input matrix. This is the same as above but now includes a bounds check. +inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID], + const int a_ld, const int a_offset, const int idm, const int idk, + const int a_transpose, const int a_conjugate, + const int kSizeM) { + #pragma unroll + for (int mi=0; mi<MWID; ++mi) { + if (idm + mi < kSizeM) { + const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi); + apm[mi] = agms[a_index + a_offset]; + if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); } + } + else { + SetToZero(apm[mi]); + } + } +} + +// Same as above, but now for the B input matrix +inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID], + const int b_ld, const int b_offset, const int idn, const int idk, + const int b_transpose, const int b_conjugate, + const int kSizeN) { + #pragma unroll + for (int ni=0; ni<NWID; ++ni) { + if (idn + ni < kSizeN) { + const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni); + bpm[ni] = bgms[b_index + b_offset]; + if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); } + } + else { + SetToZero(bpm[ni]); + } + } +} + +// ================================================================================================= + +// Caches on-chip local memory into per-thread private memory (registers). This function is specific +// for caching the A input matrix. +inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg, + const int a_transpose) { + #pragma unroll + for (int mi=0; mi<MWID; ++mi) { + const int mg = mi + get_local_id(0)*MWID; + const int index = (a_transpose) ? mg*(WGD + PADA) + kg : kg*(WGD + PADA) + mg; + apm[mi] = alm[index]; + } +} + +// Same as above, but now for the B input matrix +inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg, + const int b_transpose) { + #pragma unroll + for (int ni=0; ni<NWID; ++ni) { + const int ng = ni + get_local_id(1)*NWID; + const int index = (b_transpose) ? ng*(WGD + PADB) + kg : kg*(WGD + PADB) + ng; + bpm[ni] = blm[index]; + } +} + +// ================================================================================================= + +// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication +// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm +inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID], + const int idm, const int idn, + const real alpha, const real beta, + const int c_ld, const int c_offset, const int c_transpose) { + #pragma unroll + for (int ni=0; ni<NWID; ++ni) { + #pragma unroll + for (int mi=0; mi<MWID; ++mi) { + + // Determines the destination index + int c_index = (c_transpose) ? (idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi); + + // The final multiplication with alpha (in case beta == 0) + real result; + if (IsZero(beta)) { + Multiply(result, alpha, cpm[ni][mi]); + } + // The final multiplication with alpha and the addition with beta*C + else { + AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]); + } + cgm[c_index + c_offset] = result; + } + } +} + +// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication +// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm +inline void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID], + const int idm, const int idn, const int kSizeM, const int kSizeN, + const real alpha, const real beta, + const int c_ld, const int c_offset, const int c_transpose) { + #pragma unroll + for (int ni=0; ni<NWID; ++ni) { + #pragma unroll + for (int mi=0; mi<MWID; ++mi) { + if ((idm + mi) < kSizeM && (idn + ni) < kSizeN) { + + // Determines the destination index + int c_index = (c_transpose) ? (idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi); + + // The final multiplication with alpha (in case beta == 0) + real result; + if (IsZero(beta)) { + Multiply(result, alpha, cpm[ni][mi]); + } + // The final multiplication with alpha and the addition with beta*C + else { + AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]); + } + cgm[c_index + c_offset] = result; + } + } + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl new file mode 100644 index 00000000..d77cbf65 --- /dev/null +++ b/src/kernels/level3/xgemm_direct_part2.opencl @@ -0,0 +1,314 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This is part 2 of 3 of the GEMM kernel. See part 1 for more information. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for +// caching the A input matrix. +inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm, + const int a_ld, const int a_offset, const int kwg, + const int a_transpose, const int a_conjugate) { + #if MDIMCD == MDIMAD + const int la0 = get_local_id(0); + const int la1 = get_local_id(1); + #else + const int tid = get_local_id(0) + MDIMCD*get_local_id(1); + const int la0 = tid % MDIMAD; + const int la1 = tid / MDIMAD; + #endif + #pragma unroll + for (int mia=0; mia<MWAD/VWMD; ++mia) { + #pragma unroll + for (int kia=0; kia<KWAD; ++kia) { + + // Computes the indices for the global memory + int mg = mia + la0*(MWAD/VWMD); + int kg = kia + la1*KWAD; + int idm = (a_transpose) ? mg + kwg/VWMD : mg + GetGroupID0()*(WGD/VWMD); + int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg; + + // Loads the data from global memory into the local memory + const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset]; + #if VWMD == 1 + alm[kg*(WGD + PADA) + mg] = avec; + #elif VWMD == 2 + alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x; + alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y; + #elif VWMD == 4 + alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x; + alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y; + alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.z; + alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.w; + #elif VWMD == 8 + alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0; + alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1; + alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2; + alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3; + alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4; + alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5; + alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6; + alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7; + #elif VWMD == 16 + alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0; + alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1; + alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2; + alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3; + alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4; + alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5; + alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6; + alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7; + alm[kg*(WGD + PADA) + mg*VWMD + 8] = avec.s8; + alm[kg*(WGD + PADA) + mg*VWMD + 9] = avec.s9; + alm[kg*(WGD + PADA) + mg*VWMD + 10] = avec.sA; + alm[kg*(WGD + PADA) + mg*VWMD + 11] = avec.sB; + alm[kg*(WGD + PADA) + mg*VWMD + 12] = avec.sC; + alm[kg*(WGD + PADA) + mg*VWMD + 13] = avec.sD; + alm[kg*(WGD + PADA) + mg*VWMD + 14] = avec.sE; + alm[kg*(WGD + PADA) + mg*VWMD + 15] = avec.sF; + #endif + if (a_conjugate) { + for (int vm=0; vm<VWMD; ++vm) { + COMPLEX_CONJUGATE(alm[kg*(WGD + PADA) + mg*VWMD + vm]); + } + } + } + } +} + +// Same as above, but now for the B input matrix +inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm, + const int b_ld, const int b_offset, const int kwg, + const int b_transpose, const int b_conjugate) { + #if MDIMCD == NDIMBD + const int lb0 = get_local_id(0); + const int lb1 = get_local_id(1); + #else + const int tid = get_local_id(0) + MDIMCD*get_local_id(1); + const int lb0 = tid % NDIMBD; + const int lb1 = tid / NDIMBD; + #endif + #pragma unroll + for (int kib=0; kib<KWBD; ++kib) { + #pragma unroll + for (int nib=0; nib<NWBD/VWND; ++nib) { + + // Computes the indices for the global memory + int ng = nib + lb0*(NWBD/VWND); + int kg = kib + lb1*KWBD; + int idn = (b_transpose) ? ng + kwg/VWND : ng + GetGroupID1()*(WGD/VWND); + int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg; + + // Loads the data from global memory into the local memory + const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset]; + #if VWND == 1 + blm[kg*(WGD + PADB) + ng] = bvec; + #elif VWND == 2 + blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x; + blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y; + #elif VWND == 4 + blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x; + blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y; + blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.z; + blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.w; + #elif VWND == 8 + blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0; + blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1; + blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2; + blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3; + blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4; + blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5; + blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6; + blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7; + #elif VWND == 16 + blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0; + blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1; + blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2; + blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3; + blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4; + blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5; + blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6; + blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7; + blm[kg*(WGD + PADB) + ng*VWND + 8] = bvec.s8; + blm[kg*(WGD + PADB) + ng*VWND + 9] = bvec.s9; + blm[kg*(WGD + PADB) + ng*VWND + 10] = bvec.sA; + blm[kg*(WGD + PADB) + ng*VWND + 11] = bvec.sB; + blm[kg*(WGD + PADB) + ng*VWND + 12] = bvec.sC; + blm[kg*(WGD + PADB) + ng*VWND + 13] = bvec.sD; + blm[kg*(WGD + PADB) + ng*VWND + 14] = bvec.sE; + blm[kg*(WGD + PADB) + ng*VWND + 15] = bvec.sF; + #endif + if (b_conjugate) { + for (int vn=0; vn<VWND; ++vn) { + COMPLEX_CONJUGATE(blm[kg*(WGD + PADB) + ng*VWND + vn]); + } + } + } + } +} + +// ================================================================================================= + +// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for +// caching the A input matrix. In contrast to the functions above, this function performs doesn't +// use the vector data-types. +inline void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm, + const int a_ld, const int a_offset, const int kwg, + const int a_transpose, const int a_conjugate) { + #if MDIMCD == MDIMAD + const int la0 = get_local_id(0); + const int la1 = get_local_id(1); + #else + const int tid = get_local_id(0) + MDIMCD*get_local_id(1); + const int la0 = tid % MDIMAD; + const int la1 = tid / MDIMAD; + #endif + #pragma unroll + for (int mia=0; mia<MWAD; ++mia) { + #pragma unroll + for (int kia=0; kia<KWAD; ++kia) { + + // Computes the indices for the global memory + int mg = mia + la0*MWAD; + int kg = kia + la1*KWAD; + int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD; + int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg; + + // Loads the data from global memory into the local memory + real result = agms[idk*a_ld + idm + a_offset]; + if (a_conjugate) { COMPLEX_CONJUGATE(result); } + alm[kg*(WGD + PADA) + mg] = result; + } + } +} + +// Same as above, but now for the B input matrix +inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm, + const int b_ld, const int b_offset, const int kwg, + const int b_transpose, const int b_conjugate) { + #if MDIMCD == NDIMBD + const int lb0 = get_local_id(0); + const int lb1 = get_local_id(1); + #else + const int tid = get_local_id(0) + MDIMCD*get_local_id(1); + const int lb0 = tid % NDIMBD; + const int lb1 = tid / NDIMBD; + #endif + #pragma unroll + for (int kib=0; kib<KWBD; ++kib) { + #pragma unroll + for (int nib=0; nib<NWBD; ++nib) { + + // Computes the indices for the global memory + int ng = nib + lb0*NWBD; + int kg = kib + lb1*KWBD; + int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD; + int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg; + + // Loads the data from global memory into the local memory + real result = bgms[idk*b_ld + idn + b_offset]; + if (b_conjugate) { COMPLEX_CONJUGATE(result); } + blm[kg*(WGD + PADB) + ng] = result; + } + } +} + +// ================================================================================================= + +// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for +// caching the A input matrix. In contrast to the functions above, this function performs bounds +// checks and doesn't use the vector data-types. +inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm, + const int a_ld, const int a_offset, const int kwg, + const int a_transpose, const int a_conjugate, + const int kSizeM, const int kSizeK) { + #if MDIMCD == MDIMAD + const int la0 = get_local_id(0); + const int la1 = get_local_id(1); + #else + const int tid = get_local_id(0) + MDIMCD*get_local_id(1); + const int la0 = tid % MDIMAD; + const int la1 = tid / MDIMAD; + #endif + #pragma unroll + for (int mia=0; mia<MWAD; ++mia) { + #pragma unroll + for (int kia=0; kia<KWAD; ++kia) { + + // Computes the indices for the global memory + int mg = mia + la0*MWAD; + int kg = kia + la1*KWAD; + int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD; + int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg; + + // Loads the data from global memory into the local memory + int condition = (a_transpose) ? idm < kSizeK : idm < kSizeM; + if (condition) { + real result = agms[idk*a_ld + idm + a_offset]; + if (a_conjugate) { COMPLEX_CONJUGATE(result); } + alm[kg*(WGD + PADA) + mg] = result; + } + else { + SetToZero(alm[kg*(WGD + PADA) + mg]); + } + } + } +} + +// Same as above, but now for the B input matrix +inline void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm, + const int b_ld, const int b_offset, const int kwg, + const int b_transpose, const int b_conjugate, + const int kSizeN, const int kSizeK) { + #if MDIMCD == NDIMBD + const int lb0 = get_local_id(0); + const int lb1 = get_local_id(1); + #else + const int tid = get_local_id(0) + MDIMCD*get_local_id(1); + const int lb0 = tid % NDIMBD; + const int lb1 = tid / NDIMBD; + #endif + #pragma unroll + for (int kib=0; kib<KWBD; ++kib) { + #pragma unroll + for (int nib=0; nib<NWBD; ++nib) { + + // Computes the indices for the global memory + int ng = nib + lb0*NWBD; + int kg = kib + lb1*KWBD; + int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD; + int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg; + + // Loads the data from global memory into the local memory + int condition = (b_transpose) ? idn < kSizeK : idn < kSizeN; + if (condition) { + real result = bgms[idk*b_ld + idn + b_offset]; + if (b_conjugate) { COMPLEX_CONJUGATE(result); } + blm[kg*(WGD + PADB) + ng] = result; + } + else { + SetToZero(blm[kg*(WGD + PADB) + ng]); + } + } + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl new file mode 100644 index 00000000..a9350e00 --- /dev/null +++ b/src/kernels/level3/xgemm_direct_part3.opencl @@ -0,0 +1,214 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This is part 3 of 3 of the GEMM kernel. See part 1 for more information. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Main body of the kernel. This is the direct version without pre/post processing and restrictions. +inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + __local real* alm, __local real* blm, + const int a_transpose, const int b_transpose, const int c_transpose, + const int a_conjugate, const int b_conjugate) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Extra pointers to scalar versions of global memory + const __global real* restrict agms = (const __global real* restrict) agm; + const __global real* restrict bgms = (const __global real* restrict) bgm; + + // Allocates workitem-private memory (registers) + real apm[MWID]; + real bpm[NWID]; + real cpm[NWID][MWID]; + + // Initializes the accumulation registers + InitAccRegistersDirect(cpm); + + // The faster version of GEMM is not allowed on the (incomplete) borders. Therefore, this section + // processes only the main parts: output blocks of WGD by WGD. + const int idm = get_local_id(0) * MWID + GetGroupID0() * WGD; + const int idn = get_local_id(1) * NWID + GetGroupID1() * WGD; + if ((idm < (kSizeM/WGD)*WGD) && (idn < (kSizeN/WGD)*WGD)) { + + // Loops over all complete workgroup tiles (K-dimension) + int kwg = 0; + for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) { + + // Loads data: off-chip --> local (matrix A and B) + if (a_ld % VWMD == 0) { + GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); + } + else { + GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); + } + if (b_ld % VWND == 0) { + GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); + } + else { + GlobalToLocalScalarB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Loops over all workitem tiles, unrolled by a factor KWID + for (int pwi=0; pwi<WGD; pwi+=KWID) { + #pragma unroll + for (int pit=0; pit<KWID; ++pit) { + int kg = pwi + pit; + + // Loads data: local --> private (matrix A and B) + LocalToPrivateDirectA(alm, apm, kg, a_transpose); + LocalToPrivateDirectB(blm, bpm, kg, b_transpose); + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulateDirect(cpm, apm, bpm); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Loop over the remaining part (incomplete tile in K-dimension) + for (; kwg < kSizeK; ++kwg) { + + // Loads data: off-chip --> private (matrix A and B) + GlobalToPrivateDirectA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate); + GlobalToPrivateDirectB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate); + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulateDirect(cpm, apm, bpm); + } + + // Stores a tile of results and performs the multiplication with alpha and beta + StoreResultsDirect(cgm, cpm, idm, idn, alpha, beta, c_ld, c_offset, c_transpose); + } + + // Simple but slower version for the parts on the edge (incomplete tiles in M and N-dimensions) + else { + + // Loops over all complete workgroup tiles (K-dimension) + int kwg = 0; + for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) { + + // Loads data: off-chip --> local (matrix A and B) + GlobalToLocalCheckedA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate, kSizeM, kSizeK); + GlobalToLocalCheckedB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate, kSizeN, kSizeK); + barrier(CLK_LOCAL_MEM_FENCE); + + // Loops over all workitem tiles, unrolled by a factor KWID + for (int pwi=0; pwi<WGD; pwi+=KWID) { + #pragma unroll + for (int pit=0; pit<KWID; ++pit) { + int kg = pwi + pit; + + // Loads data: local --> private (matrix A and B) + LocalToPrivateDirectA(alm, apm, kg, a_transpose); + LocalToPrivateDirectB(blm, bpm, kg, b_transpose); + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulateDirect(cpm, apm, bpm); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Loop over the remaining part (incomplete tile in K-dimension) + for (; kwg < kSizeK; ++kwg) { + + // Loads data: off-chip --> private (matrix A and B) + GlobalToPrivateCheckedA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate, kSizeM); + GlobalToPrivateCheckedB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate, kSizeN); + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulateDirect(cpm, apm, bpm); + } + + // Stores a tile of results and performs the multiplication with alpha and beta + StoreResultsChecked(cgm, cpm, idm, idn, kSizeM, kSizeN, alpha, beta, c_ld, c_offset, c_transpose); + } +} + +// ================================================================================================= + +// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [transposed, transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate); +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/msvc.hpp b/src/msvc.hpp new file mode 100644 index 00000000..a45105df --- /dev/null +++ b/src/msvc.hpp @@ -0,0 +1,39 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file provides macro's and definitions to make compilation work on Microsoft Visual Studio, +// in particular for versions older than 2015 with limited C++11 support. +// MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015) +// MSVC++ 12.0 _MSC_VER == 1800 (Visual Studio 2013) +// MSVC++ 11.0 _MSC_VER == 1700 (Visual Studio 2012) +// MSVC++ 10.0 _MSC_VER == 1600 (Visual Studio 2010) +// MSVC++ 9.0 _MSC_VER == 1500 (Visual Studio 2008) +// +// ================================================================================================= + +#ifndef CLBLAST_MSVC_HPP_ +#define CLBLAST_MSVC_HPP_ + +namespace clblast { +// ================================================================================================= +#ifdef _MSC_VER + +// No support for constexpr prior to 2015. Note that this only works with constants, not with +// constexpr functions (unused in this project). +#if _MSC_VER < 1900 +#define constexpr const +#endif + +// _MSC_VER +#endif +// ================================================================================================= +} // namespace clblast + +// CLBLAST_MSVC_HPP_ +#endif diff --git a/src/routine.cpp b/src/routine.cpp index 189ae190..d938d66f 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -14,6 +14,7 @@ #include <string> #include <vector> #include <chrono> +#include <cstdlib> #include "routine.hpp" @@ -42,13 +43,19 @@ StatusCode Routine::SetUp() { // Queries the cache to see whether or not the program (context-specific) is already there if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; } + // Sets the build options from an environmental variable (if set) + auto options = std::vector<std::string>(); + const auto environment_variable = std::getenv("CLBLAST_BUILD_OPTIONS"); + if (environment_variable != nullptr) { + options.push_back(std::string(environment_variable)); + } + // Queries the cache to see whether or not the binary (device-specific) is already there. If it // is, a program is created and stored in the cache if (BinaryIsInCache(device_name_, precision_, routine_name_)) { try { auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_); auto program = Program(device_, context_, binary); - auto options = std::vector<std::string>(); program.Build(device_, options); StoreProgramToCache(program, context_, precision_, routine_name_); } catch (...) { return StatusCode::kBuildProgramFailure; } @@ -115,7 +122,6 @@ StatusCode Routine::SetUp() { // Compiles the kernel try { auto program = Program(context_, source_string); - auto options = std::vector<std::string>(); const auto build_status = program.Build(device_, options); // Checks for compiler crashes/errors/warnings diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 0b8e768f..1602c69f 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -22,7 +22,9 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) { + Routine(queue, event, name, + {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"}, + PrecisionValue<T>()) { source_string_ = #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" @@ -32,10 +34,16 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/convert_symmetric.opencl" #include "../../kernels/level3/convert_triangular.opencl" #include "../../kernels/level3/convert_hermitian.opencl" + #include "../../kernels/level3/xgemm_direct_part1.opencl" + #include "../../kernels/level3/xgemm_direct_part2.opencl" + #include "../../kernels/level3/xgemm_direct_part3.opencl" + ; + auto source_string_part_2 = // separated in two parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" ; + source_string_ += source_string_part_2; } // ================================================================================================= @@ -98,6 +106,44 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld); if (ErrorIn(status)) { return status; } + // Selects which version of GEMM to run + const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]); + if (do_gemm_direct) { // for small sizes (single kernel) + return GemmDirect(m, n, k, alpha, + a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, + c_buffer, c_offset, c_ld, + a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate); + } + else { // for larger sizes (pre/post-processing plus a very fast kernel) + return GemmIndirect(m, n, k, alpha, + a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, + c_buffer, c_offset, c_ld, + a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, + a_one, a_two, a_want_rotated, + b_one, b_two, b_want_rotated, + c_one, c_two, c_want_rotated); + } +} + +// ================================================================================================= + +// The indirect version of GEMM. This uses the faster but non-general kernel. It has specific +// requirements, but several pre and post-processing kernels take care of those. However, the +// overhead of these extra kernels might not be ideal for certain devices/arguments. +template <typename T> +StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate, + const size_t a_one, const size_t a_two, const bool a_want_rotated, + const size_t b_one, const size_t b_two, const bool b_want_rotated, + const size_t c_one, const size_t c_two, const bool c_want_rotated) { + auto status = StatusCode::kSuccess; + // Calculates the ceiled versions of m, n, and k const auto m_ceiled = Ceil(m, db_["MWG"]); const auto n_ceiled = Ceil(n, db_["NWG"]); @@ -217,6 +263,66 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, } catch (...) { return StatusCode::kTempBufferAllocFailure; } } + +// ================================================================================================= + +// The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels. +template <typename T> +StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate) { + + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); + + // Retrieves the proper XgemmDirect kernel from the compiled binary + try { + const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") : + (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN"); + auto kernel = Kernel(program, name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(m)); + kernel.SetArgument(1, static_cast<int>(n)); + kernel.SetArgument(2, static_cast<int>(k)); + kernel.SetArgument(3, GetRealArg(alpha)); + kernel.SetArgument(4, GetRealArg(beta)); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, static_cast<int>(a_offset)); + kernel.SetArgument(7, static_cast<int>(a_ld)); + kernel.SetArgument(8, b_buffer()); + kernel.SetArgument(9, static_cast<int>(b_offset)); + kernel.SetArgument(10, static_cast<int>(b_ld)); + kernel.SetArgument(11, c_buffer()); + kernel.SetArgument(12, static_cast<int>(c_offset)); + kernel.SetArgument(13, static_cast<int>(c_ld)); + kernel.SetArgument(14, static_cast<int>(c_do_transpose)); + kernel.SetArgument(15, static_cast<int>(a_conjugate)); + kernel.SetArgument(16, static_cast<int>(b_conjugate)); + + // Computes the global and local thread sizes + const auto m_ceiled = Ceil(m, db_["WGD"]); + const auto n_ceiled = Ceil(n, db_["WGD"]); + const auto global = std::vector<size_t>{ + (m_ceiled * db_["MDIMCD"]) / db_["WGD"], + (n_ceiled * db_["NDIMCD"]) / db_["WGD"] + }; + const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]}; + + // Launches the kernel + auto status = RunKernel(kernel, queue_, device_, global, local, event_); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + // ================================================================================================= // Compiles the templated class diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp index bc51c7f5..46e12453 100644 --- a/src/routines/level3/xgemm.hpp +++ b/src/routines/level3/xgemm.hpp @@ -35,6 +35,29 @@ class Xgemm: public Routine { const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld); + + // Indirect version of GEMM (with pre and post-processing kernels) + StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate, + const size_t a_one, const size_t a_two, const bool a_want_rotated, + const size_t b_one, const size_t b_two, const bool b_want_rotated, + const size_t c_one, const size_t c_two, const bool c_want_rotated); + + // Direct version of GEMM (no pre and post-processing kernels) + StatusCode GemmDirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate); }; // ================================================================================================= diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp index 78ded56e..c57aab39 100644 --- a/src/tuning/kernels/copy_fast.cpp +++ b/src/tuning/kernels/copy_fast.cpp @@ -47,6 +47,7 @@ class TuneCopy { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp index 90f5ea82..9486ee8d 100644 --- a/src/tuning/kernels/copy_pad.cpp +++ b/src/tuning/kernels/copy_pad.cpp @@ -47,6 +47,7 @@ class TunePad { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp index 10fa80cb..2d9d5e49 100644 --- a/src/tuning/kernels/transpose_fast.cpp +++ b/src/tuning/kernels/transpose_fast.cpp @@ -47,6 +47,7 @@ class TuneTranspose { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp index 507718eb..d364dabe 100644 --- a/src/tuning/kernels/transpose_pad.cpp +++ b/src/tuning/kernels/transpose_pad.cpp @@ -47,6 +47,7 @@ class TunePadTranspose { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index 0033b3c6..403ee9e4 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -51,6 +51,7 @@ class TuneXaxpy { static size_t DefaultN() { return 4096*1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { return args.n; } diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp index 1581e13f..f8416761 100644 --- a/src/tuning/kernels/xdot.cpp +++ b/src/tuning/kernels/xdot.cpp @@ -47,6 +47,7 @@ class TuneXdot { static size_t DefaultN() { return 2*1024*1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { return args.n; } diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 4cb7fd00..0eb1875b 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -52,6 +52,7 @@ class TuneXgemm { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1024; } static double DefaultFraction() { return (V==1) ? 1.0 : 512.0; } // test all or sample randomly + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel @@ -126,10 +127,10 @@ class TuneXgemm { // Sets the local memory size static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) { auto LocalMemorySize = [args] (std::vector<size_t> v) { - return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision)); + return (((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]))*GetBytes(args.precision)); }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM", - "SB", "KWG", "NWG", "VWN"}); + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", + "SB", "KWG", "NWG"}); } // Sets the base thread configuration diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp new file mode 100644 index 00000000..204e0be4 --- /dev/null +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -0,0 +1,196 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file uses the CLTune auto-tuner to tune the direct xgemm kernels. There are two variations: +// - V==1: This tests some limited set of tuning parameters exhaustively. +// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset. +// +// ================================================================================================= + +#include <string> +#include <vector> + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T, int V> +class TuneXgemmDirect { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; } + static std::string KernelName() { return "XgemmDirectTN"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/xgemm_direct_part1.opencl" + #include "../src/kernels/level3/xgemm_direct_part2.opencl" + #include "../src/kernels/level3/xgemm_direct_part3.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector<std::string> GetOptions() { + return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction}; + } + + // Tests for valid arguments + static void TestValidArguments(const Arguments<T> &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 256; } + static size_t DefaultN() { return 256; } + static size_t DefaultK() { return 256; } + static double DefaultFraction() { return (V==1) ? 1.0 : 32.0; } // test all or sample randomly + static size_t DefaultNumRuns() { return 4; } // run every kernel this many times for averaging + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.k; } + static size_t GetSizeB(const Arguments<T> &args) { return args.n * args.k; } + static size_t GetSizeC(const Arguments<T> &args) { return args.m * args.n; } + static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + if (V==1) { // limited subset of tuning parameters - but explorable exhaustively + tuner.AddParameter(id, "WGD", {8, 16, 32}); + tuner.AddParameter(id, "MDIMCD", {8, 16, 32}); + tuner.AddParameter(id, "NDIMCD", {8, 16, 32}); + tuner.AddParameter(id, "MDIMAD", {8, 16, 32}); + tuner.AddParameter(id, "NDIMBD", {8, 16, 32}); + tuner.AddParameter(id, "KWID", {2}); + tuner.AddParameter(id, "VWMD", {1, 2, 4, 8}); + tuner.AddParameter(id, "VWND", {1, 2, 4, 8}); + tuner.AddParameter(id, "PADA", {1}); + tuner.AddParameter(id, "PADB", {1}); + } // a lot more tuning parameters - has to be sampled randomly, too much to test all + else { + tuner.AddParameter(id, "WGD", {8, 16, 32, 64, 128}); + tuner.AddParameter(id, "MDIMCD", {8, 16, 32}); + tuner.AddParameter(id, "NDIMCD", {8, 16, 32}); + tuner.AddParameter(id, "MDIMAD", {8, 16, 32}); + tuner.AddParameter(id, "NDIMBD", {8, 16, 32}); + tuner.AddParameter(id, "KWID", {2, 8, 16}); + tuner.AddParameter(id, "VWMD", {1, 2, 4, 8}); + tuner.AddParameter(id, "VWND", {1, 2, 4, 8}); + tuner.AddParameter(id, "PADA", {0, 1}); + tuner.AddParameter(id, "PADB", {0, 1}); + } + } + + // Sets the constraints + static void SetConstraints(cltune::Tuner &tuner, const size_t id) { + auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; + auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); }; + auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; + // Requirement for unrolling the WGD loop + tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"}); + // Required for integer MWID and NWID + tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}); + tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}); + // Required for integer MWIAD and NWIBD + tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}); + tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}); + // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...) + tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}); + tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}); + + // Extra constraints for variation 1 to limit the set of options significantly + if (V==1) { + auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; }; + tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"}); + tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"}); + } + } + + // Sets the local memory size + static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) { + auto LocalMemorySize = [args] (std::vector<size_t> v) { + return ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))*GetBytes(args.precision)); + }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"}); + } + + // Sets the base thread configuration + static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; } + static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); } + static std::vector<size_t> LocalSize() { return {1, 1}; } + static std::vector<size_t> LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector<std::vector<std::string>>; + static TransformVector MulLocal() { return {{"MDIMCD", "NDIMCD"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {{"MDIMCD", "NDIMCD"}}; } + static TransformVector DivGlobal() { return {{"WGD", "WGD"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, + std::vector<T> &, std::vector<T> &, + std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat, + std::vector<T> &) { + tuner.AddArgumentScalar(static_cast<int>(args.m)); + tuner.AddArgumentScalar(static_cast<int>(args.n)); + tuner.AddArgumentScalar(static_cast<int>(args.k)); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); + tuner.AddArgumentScalar(GetRealArg(args.beta)); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentScalar(0); // a_offset + tuner.AddArgumentScalar(static_cast<int>(args.k)); // a_ld + tuner.AddArgumentInput(b_mat); + tuner.AddArgumentScalar(0); // b_offset + tuner.AddArgumentScalar(static_cast<int>(args.n)); // b_ld + tuner.AddArgumentOutput(c_mat); + tuner.AddArgumentScalar(0); // c_offset + tuner.AddArgumentScalar(static_cast<int>(args.n)); // c_ld + tuner.AddArgumentScalar(1); // c_do_transpose + tuner.AddArgumentScalar(0); // a_conjugate + tuner.AddArgumentScalar(0); // b_conjugate + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments<T> &args) { + return 2 * args.m * args.n * args.k; + } + static std::string PerformanceUnit() { return "GFLOPS"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Function to tune a specific variation V (not within the clblast namespace) +template <int V> +void StartVariation(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemmDirect<half,V>, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemmDirect<float,V>, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemmDirect<double,V>, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemmDirect<float2,V>, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemmDirect<double2,V>, double2>(argc, argv); break; + } +} + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index 7229602d..f332f52a 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -50,6 +50,7 @@ class TuneXgemv { static size_t DefaultN() { return 2048; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { return args.n; } diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp index 1fb5c531..c3d0c7dd 100644 --- a/src/tuning/kernels/xger.cpp +++ b/src/tuning/kernels/xger.cpp @@ -47,6 +47,7 @@ class TuneXger { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { return args.m; } diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp index 19df5f9a..afb092bc 100644 --- a/src/tuning/tuning.hpp +++ b/src/tuning/tuning.hpp @@ -30,6 +30,7 @@ namespace clblast { // that it is automatically compiled for the various kernels (given as the 'C' template argument). template <typename C, typename T> void Tuner(int argc, char* argv[]) { + constexpr auto kSeed = 42; // fixed seed for reproducibility // Sets the parameters and platform/device for which to tune (command-line options) auto help = std::string{"* Options given/available:\n"}; @@ -45,6 +46,8 @@ void Tuner(int argc, char* argv[]) { if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); } if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); } } + const auto num_runs = GetArgument(argc, argv, help, kArgNumRuns, C::DefaultNumRuns()); + fprintf(stdout, "%s\n", help.c_str()); // Tests validity of the given arguments @@ -73,12 +76,12 @@ void Tuner(int argc, char* argv[]) { auto b_mat = std::vector<T>(C::GetSizeB(args)); auto c_mat = std::vector<T>(C::GetSizeC(args)); auto temp = std::vector<T>(C::GetSizeTemp(args)); - PopulateVector(x_vec); - PopulateVector(y_vec); - PopulateVector(a_mat); - PopulateVector(b_mat); - PopulateVector(c_mat); - PopulateVector(temp); + PopulateVector(x_vec, kSeed); + PopulateVector(y_vec, kSeed); + PopulateVector(a_mat, kSeed); + PopulateVector(b_mat, kSeed); + PopulateVector(c_mat, kSeed); + PopulateVector(temp, kSeed); // Initializes the tuner for the chosen device cltune::Tuner tuner(args.platform_id, args.device_id); @@ -126,6 +129,7 @@ void Tuner(int argc, char* argv[]) { C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp); // Starts the tuning process + tuner.SetNumRuns(num_runs); tuner.Tune(); // Prints the results to screen @@ -134,7 +138,7 @@ void Tuner(int argc, char* argv[]) { // Also prints the performance of the best-case in terms of GB/s or GFLOPS if (time_ms != 0.0) { - printf("[ -------> ] %.1lf ms", time_ms); + printf("[ -------> ] %.2lf ms", time_ms); printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str()); } diff --git a/src/utilities.cpp b/src/utilities.cpp index 77bc72d7..86cc2d13 100644 --- a/src/utilities.cpp +++ b/src/utilities.cpp @@ -270,40 +270,40 @@ unsigned int GetRandomSeed() { // Create a random number generator and populates a vector with samples from a random distribution template <typename T> -void PopulateVector(std::vector<T> &vector) { +void PopulateVector(std::vector<T> &vector, const unsigned int seed) { auto lower_limit = static_cast<T>(kTestDataLowerLimit); auto upper_limit = static_cast<T>(kTestDataUpperLimit); - std::mt19937 mt(GetRandomSeed()); + std::mt19937 mt(seed); std::uniform_real_distribution<T> dist(lower_limit, upper_limit); for (auto &element: vector) { element = dist(mt); } } -template void PopulateVector<float>(std::vector<float>&); -template void PopulateVector<double>(std::vector<double>&); +template void PopulateVector<float>(std::vector<float>&, const unsigned int); +template void PopulateVector<double>(std::vector<double>&, const unsigned int); // Specialized versions of the above for complex data-types template <> -void PopulateVector(std::vector<float2> &vector) { +void PopulateVector(std::vector<float2> &vector, const unsigned int seed) { auto lower_limit = static_cast<float>(kTestDataLowerLimit); auto upper_limit = static_cast<float>(kTestDataUpperLimit); - std::mt19937 mt(GetRandomSeed()); + std::mt19937 mt(seed); std::uniform_real_distribution<float> dist(lower_limit, upper_limit); for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } } template <> -void PopulateVector(std::vector<double2> &vector) { +void PopulateVector(std::vector<double2> &vector, const unsigned int seed) { auto lower_limit = static_cast<double>(kTestDataLowerLimit); auto upper_limit = static_cast<double>(kTestDataUpperLimit); - std::mt19937 mt(GetRandomSeed()); + std::mt19937 mt(seed); std::uniform_real_distribution<double> dist(lower_limit, upper_limit); for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } } // Specialized versions of the above for half-precision template <> -void PopulateVector(std::vector<half> &vector) { +void PopulateVector(std::vector<half> &vector, const unsigned int seed) { const auto lower_limit = static_cast<float>(kTestDataLowerLimit); const auto upper_limit = static_cast<float>(kTestDataUpperLimit); - std::mt19937 mt(GetRandomSeed()); + std::mt19937 mt(seed); std::uniform_real_distribution<float> dist(lower_limit, upper_limit); for (auto &element: vector) { element = FloatToHalf(dist(mt)); } } diff --git a/src/utilities.hpp b/src/utilities.hpp index 75bd5a69..038a8a96 100644 --- a/src/utilities.hpp +++ b/src/utilities.hpp @@ -25,6 +25,8 @@ #include "clblast_half.h" #include "clpp11.hpp" +#include "msvc.hpp" + namespace clblast { // ================================================================================================= @@ -206,7 +208,7 @@ bool CheckArgument(const int argc, char *argv[], std::string &help, const std::s // ================================================================================================= // Helper function to check for errors in the status code -constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); } +inline bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); } // ================================================================================================= @@ -219,7 +221,7 @@ constexpr auto kTestDataUpperLimit = 2.0; // Populates a vector with random data template <typename T> -void PopulateVector(std::vector<T> &vector); +void PopulateVector(std::vector<T> &vector, const unsigned int seed); // ================================================================================================= diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp index 2e751255..fc908b9e 100644 --- a/test/correctness/testblas.cpp +++ b/test/correctness/testblas.cpp @@ -66,13 +66,13 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent, c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset); scalar_source_.resize(std::max(max_mat, max_matvec) + max_offset); - PopulateVector(x_source_); - PopulateVector(y_source_); - PopulateVector(a_source_); - PopulateVector(b_source_); - PopulateVector(c_source_); - PopulateVector(ap_source_); - PopulateVector(scalar_source_); + PopulateVector(x_source_, kSeed); + PopulateVector(y_source_, kSeed); + PopulateVector(a_source_, kSeed); + PopulateVector(b_source_, kSeed); + PopulateVector(c_source_, kSeed); + PopulateVector(ap_source_, kSeed); + PopulateVector(scalar_source_, kSeed); } // =============================================================================================== diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp index d01cd06c..4b773801 100644 --- a/test/correctness/testblas.hpp +++ b/test/correctness/testblas.hpp @@ -30,6 +30,7 @@ namespace clblast { template <typename T, typename U> class TestBlas: public Tester<T,U> { public: + static constexpr auto kSeed = 42; // fixed seed for reproducibility // Uses several variables from the Tester class using Tester<T,U>::context_; diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp index 362c5c2c..41e457b6 100644 --- a/test/correctness/tester.cpp +++ b/test/correctness/tester.cpp @@ -358,28 +358,33 @@ void Tester<T,U>::PrintErrorLog(const std::vector<ErrorLogEntry> &error_log) { // Compares two floating point values and returns whether they are within an acceptable error // margin. This replaces GTest's EXPECT_NEAR(). template <typename T> -bool TestSimilarity(const T val1, const T val2) { +bool TestSimilarityNear(const T val1, const T val2, + const T error_margin_absolute, const T error_margin_relative) { const auto difference = std::fabs(val1 - val2); - // Set the allowed error margin for floating-point comparisons - constexpr auto kErrorMarginRelative = T(0.025); - constexpr auto kErrorMarginAbsolute = T(1.0e-3); - // Shortcut, handles infinities if (val1 == val2) { return true; } // The values are zero or very small: the relative error is less meaningful - else if (val1 == 0 || val2 == 0 || difference < kErrorMarginAbsolute) { - return (difference < kErrorMarginAbsolute); + else if (val1 == 0 || val2 == 0 || difference < error_margin_absolute) { + return (difference < error_margin_absolute); } // Use relative error else { const auto absolute_sum = std::fabs(val1) + std::fabs(val2); - return (difference / absolute_sum) < kErrorMarginRelative; + return (difference / absolute_sum) < error_margin_relative; } } +// Default method for similarity testing +template <typename T> +bool TestSimilarity(const T val1, const T val2) { + constexpr auto kErrorMarginRelative = T(0.025); + constexpr auto kErrorMarginAbsolute = T(0.001); + return TestSimilarityNear(val1, val2, kErrorMarginRelative, kErrorMarginAbsolute); +} + // Compiles the default case for standard data-types template bool TestSimilarity<float>(const float, const float); template bool TestSimilarity<double>(const double, const double); @@ -399,7 +404,10 @@ bool TestSimilarity(const double2 val1, const double2 val2) { } template <> bool TestSimilarity(const half val1, const half val2) { - return TestSimilarity(HalfToFloat(val1), HalfToFloat(val2)); + constexpr auto kErrorMarginRelative = float(0.050); + constexpr auto kErrorMarginAbsolute = float(0.002); + return TestSimilarityNear(HalfToFloat(val1), HalfToFloat(val2), + kErrorMarginRelative, kErrorMarginAbsolute); } // ================================================================================================= diff --git a/test/performance/client.cpp b/test/performance/client.cpp index aaaab22e..cbb10d10 100644 --- a/test/performance/client.cpp +++ b/test/performance/client.cpp @@ -178,13 +178,13 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) std::vector<T> c_source(args.c_size); std::vector<T> ap_source(args.ap_size); std::vector<T> scalar_source(args.scalar_size); - PopulateVector(x_source); - PopulateVector(y_source); - PopulateVector(a_source); - PopulateVector(b_source); - PopulateVector(c_source); - PopulateVector(ap_source); - PopulateVector(scalar_source); + PopulateVector(x_source, kSeed); + PopulateVector(y_source, kSeed); + PopulateVector(a_source, kSeed); + PopulateVector(b_source, kSeed); + PopulateVector(c_source, kSeed); + PopulateVector(ap_source, kSeed); + PopulateVector(scalar_source, kSeed); // Creates the matrices on the device auto x_vec = Buffer<T>(context, args.x_size); diff --git a/test/performance/client.hpp b/test/performance/client.hpp index 6d35fced..381ba158 100644 --- a/test/performance/client.hpp +++ b/test/performance/client.hpp @@ -40,6 +40,7 @@ namespace clblast { template <typename T, typename U> class Client { public: + static constexpr auto kSeed = 42; // fixed seed for reproducibility // Shorthand for the routine-specific functions passed to the tester using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>; diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp index 4423845e..f98bdb06 100644 --- a/test/routines/level1/xamax.hpp +++ b/test/routines/level1/xamax.hpp @@ -76,7 +76,7 @@ class TestXamax { buffers.scalar(), args.imax_offset, buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level1/xasum.hpp b/test/routines/level1/xasum.hpp index b1f02dcd..64aa37c2 100644 --- a/test/routines/level1/xasum.hpp +++ b/test/routines/level1/xasum.hpp @@ -76,7 +76,7 @@ class TestXasum { buffers.scalar(), args.asum_offset, buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level1/xaxpy.hpp b/test/routines/level1/xaxpy.hpp index c276a42e..b24e6fe8 100644 --- a/test/routines/level1/xaxpy.hpp +++ b/test/routines/level1/xaxpy.hpp @@ -77,7 +77,7 @@ class TestXaxpy { buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level1/xcopy.hpp b/test/routines/level1/xcopy.hpp index a96bb9ae..87bc21d4 100644 --- a/test/routines/level1/xcopy.hpp +++ b/test/routines/level1/xcopy.hpp @@ -76,7 +76,7 @@ class TestXcopy { buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level1/xdot.hpp b/test/routines/level1/xdot.hpp index f6cf2809..c4f6076a 100644 --- a/test/routines/level1/xdot.hpp +++ b/test/routines/level1/xdot.hpp @@ -81,7 +81,7 @@ class TestXdot { buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level1/xdotc.hpp b/test/routines/level1/xdotc.hpp index 2b00d04b..aae892a8 100644 --- a/test/routines/level1/xdotc.hpp +++ b/test/routines/level1/xdotc.hpp @@ -81,7 +81,7 @@ class TestXdotc { buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level1/xdotu.hpp b/test/routines/level1/xdotu.hpp index 31a867e0..f6be385b 100644 --- a/test/routines/level1/xdotu.hpp +++ b/test/routines/level1/xdotu.hpp @@ -81,7 +81,7 @@ class TestXdotu { buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level1/xnrm2.hpp b/test/routines/level1/xnrm2.hpp index 62d649e3..e604077c 100644 --- a/test/routines/level1/xnrm2.hpp +++ b/test/routines/level1/xnrm2.hpp @@ -76,7 +76,7 @@ class TestXnrm2 { buffers.scalar(), args.nrm2_offset, buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level1/xscal.hpp b/test/routines/level1/xscal.hpp index 79926890..3c438bd6 100644 --- a/test/routines/level1/xscal.hpp +++ b/test/routines/level1/xscal.hpp @@ -72,7 +72,7 @@ class TestXscal { auto status = Scal(args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level1/xswap.hpp b/test/routines/level1/xswap.hpp index 8f7e4cfe..a0491f12 100644 --- a/test/routines/level1/xswap.hpp +++ b/test/routines/level1/xswap.hpp @@ -76,7 +76,7 @@ class TestXswap { buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xgbmv.hpp b/test/routines/level2/xgbmv.hpp index 5a907077..5ed92aae 100644 --- a/test/routines/level2/xgbmv.hpp +++ b/test/routines/level2/xgbmv.hpp @@ -90,7 +90,7 @@ class TestXgbmv { buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xgemv.hpp b/test/routines/level2/xgemv.hpp index 1499b2d2..9ee6d535 100644 --- a/test/routines/level2/xgemv.hpp +++ b/test/routines/level2/xgemv.hpp @@ -90,7 +90,7 @@ class TestXgemv { buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xger.hpp b/test/routines/level2/xger.hpp index 5cbed505..42283107 100644 --- a/test/routines/level2/xger.hpp +++ b/test/routines/level2/xger.hpp @@ -86,7 +86,7 @@ class TestXger { buffers.y_vec(), args.y_offset, args.y_inc, buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xgerc.hpp b/test/routines/level2/xgerc.hpp index d50092cb..ef69c197 100644 --- a/test/routines/level2/xgerc.hpp +++ b/test/routines/level2/xgerc.hpp @@ -86,7 +86,7 @@ class TestXgerc { buffers.y_vec(), args.y_offset, args.y_inc, buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xgeru.hpp b/test/routines/level2/xgeru.hpp index 9c823b73..b2afc6d8 100644 --- a/test/routines/level2/xgeru.hpp +++ b/test/routines/level2/xgeru.hpp @@ -86,7 +86,7 @@ class TestXgeru { buffers.y_vec(), args.y_offset, args.y_inc, buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xhbmv.hpp b/test/routines/level2/xhbmv.hpp index 01cb3f51..8bda4d0c 100644 --- a/test/routines/level2/xhbmv.hpp +++ b/test/routines/level2/xhbmv.hpp @@ -84,7 +84,7 @@ class TestXhbmv { buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xhemv.hpp b/test/routines/level2/xhemv.hpp index dadd3975..80565d04 100644 --- a/test/routines/level2/xhemv.hpp +++ b/test/routines/level2/xhemv.hpp @@ -84,7 +84,7 @@ class TestXhemv { buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xher.hpp b/test/routines/level2/xher.hpp index b21c0a9b..d71c8009 100644 --- a/test/routines/level2/xher.hpp +++ b/test/routines/level2/xher.hpp @@ -79,7 +79,7 @@ class TestXher { buffers.x_vec(), args.x_offset, args.x_inc, buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xher2.hpp b/test/routines/level2/xher2.hpp index 070f823c..083dfa2f 100644 --- a/test/routines/level2/xher2.hpp +++ b/test/routines/level2/xher2.hpp @@ -84,7 +84,7 @@ class TestXher2 { buffers.y_vec(), args.y_offset, args.y_inc, buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xhpmv.hpp b/test/routines/level2/xhpmv.hpp index d7f9634e..1dd63562 100644 --- a/test/routines/level2/xhpmv.hpp +++ b/test/routines/level2/xhpmv.hpp @@ -84,7 +84,7 @@ class TestXhpmv { buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp index 8f44a68d..a5c77811 100644 --- a/test/routines/level2/xhpr.hpp +++ b/test/routines/level2/xhpr.hpp @@ -79,7 +79,7 @@ class TestXhpr { buffers.x_vec(), args.x_offset, args.x_inc, buffers.ap_mat(), args.ap_offset, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp index 666a8dfc..d09178f0 100644 --- a/test/routines/level2/xhpr2.hpp +++ b/test/routines/level2/xhpr2.hpp @@ -84,7 +84,7 @@ class TestXhpr2 { buffers.y_vec(), args.y_offset, args.y_inc, buffers.ap_mat(), args.ap_offset, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xsbmv.hpp b/test/routines/level2/xsbmv.hpp index fd5dd68e..8e0f8321 100644 --- a/test/routines/level2/xsbmv.hpp +++ b/test/routines/level2/xsbmv.hpp @@ -84,7 +84,7 @@ class TestXsbmv { buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xspmv.hpp b/test/routines/level2/xspmv.hpp index 63286248..977f733a 100644 --- a/test/routines/level2/xspmv.hpp +++ b/test/routines/level2/xspmv.hpp @@ -84,7 +84,7 @@ class TestXspmv { buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp index f9dead53..93da4b73 100644 --- a/test/routines/level2/xspr.hpp +++ b/test/routines/level2/xspr.hpp @@ -79,7 +79,7 @@ class TestXspr { buffers.x_vec(), args.x_offset, args.x_inc, buffers.ap_mat(), args.ap_offset, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp index a2f22098..b835f2b0 100644 --- a/test/routines/level2/xspr2.hpp +++ b/test/routines/level2/xspr2.hpp @@ -84,7 +84,7 @@ class TestXspr2 { buffers.y_vec(), args.y_offset, args.y_inc, buffers.ap_mat(), args.ap_offset, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xsymv.hpp b/test/routines/level2/xsymv.hpp index 0d3ca632..0ec96f1d 100644 --- a/test/routines/level2/xsymv.hpp +++ b/test/routines/level2/xsymv.hpp @@ -84,7 +84,7 @@ class TestXsymv { buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xsyr.hpp b/test/routines/level2/xsyr.hpp index 15ad9595..b49132e3 100644 --- a/test/routines/level2/xsyr.hpp +++ b/test/routines/level2/xsyr.hpp @@ -79,7 +79,7 @@ class TestXsyr { buffers.x_vec(), args.x_offset, args.x_inc, buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xsyr2.hpp b/test/routines/level2/xsyr2.hpp index a9a61a1f..7c65daa2 100644 --- a/test/routines/level2/xsyr2.hpp +++ b/test/routines/level2/xsyr2.hpp @@ -84,7 +84,7 @@ class TestXsyr2 { buffers.y_vec(), args.y_offset, args.y_inc, buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xtbmv.hpp b/test/routines/level2/xtbmv.hpp index 54e7fe18..cf30c2f7 100644 --- a/test/routines/level2/xtbmv.hpp +++ b/test/routines/level2/xtbmv.hpp @@ -78,7 +78,7 @@ class TestXtbmv { buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xtpmv.hpp b/test/routines/level2/xtpmv.hpp index 9776c4de..d08e132f 100644 --- a/test/routines/level2/xtpmv.hpp +++ b/test/routines/level2/xtpmv.hpp @@ -78,7 +78,7 @@ class TestXtpmv { buffers.ap_mat(), args.ap_offset, buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level2/xtrmv.hpp b/test/routines/level2/xtrmv.hpp index 18300e50..cf9a0063 100644 --- a/test/routines/level2/xtrmv.hpp +++ b/test/routines/level2/xtrmv.hpp @@ -78,7 +78,7 @@ class TestXtrmv { buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp index 5f9bea81..bca3c049 100644 --- a/test/routines/level3/xgemm.hpp +++ b/test/routines/level3/xgemm.hpp @@ -92,7 +92,7 @@ class TestXgemm { buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level3/xhemm.hpp b/test/routines/level3/xhemm.hpp index 8c44be25..31c7695f 100644 --- a/test/routines/level3/xhemm.hpp +++ b/test/routines/level3/xhemm.hpp @@ -92,7 +92,7 @@ class TestXhemm { buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level3/xher2k.hpp b/test/routines/level3/xher2k.hpp index fd20bbb5..ff2bb6cb 100644 --- a/test/routines/level3/xher2k.hpp +++ b/test/routines/level3/xher2k.hpp @@ -91,7 +91,7 @@ class TestXher2k { buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level3/xherk.hpp b/test/routines/level3/xherk.hpp index 12990d39..26396fa9 100644 --- a/test/routines/level3/xherk.hpp +++ b/test/routines/level3/xherk.hpp @@ -82,7 +82,7 @@ class TestXherk { buffers.a_mat(), args.a_offset, args.a_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level3/xsymm.hpp b/test/routines/level3/xsymm.hpp index f8e90927..c84c22b4 100644 --- a/test/routines/level3/xsymm.hpp +++ b/test/routines/level3/xsymm.hpp @@ -92,7 +92,7 @@ class TestXsymm { buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level3/xsyr2k.hpp b/test/routines/level3/xsyr2k.hpp index 4e4ba0b7..5c4976e2 100644 --- a/test/routines/level3/xsyr2k.hpp +++ b/test/routines/level3/xsyr2k.hpp @@ -90,7 +90,7 @@ class TestXsyr2k { buffers.b_mat(), args.b_offset, args.b_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level3/xsyrk.hpp b/test/routines/level3/xsyrk.hpp index f5509c88..98c4f6a4 100644 --- a/test/routines/level3/xsyrk.hpp +++ b/test/routines/level3/xsyrk.hpp @@ -82,7 +82,7 @@ class TestXsyrk { buffers.a_mat(), args.a_offset, args.a_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/level3/xtrmm.hpp b/test/routines/level3/xtrmm.hpp index 45e17e45..55b51e54 100644 --- a/test/routines/level3/xtrmm.hpp +++ b/test/routines/level3/xtrmm.hpp @@ -82,7 +82,7 @@ class TestXtrmm { buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp index 4637c07e..dccb3583 100644 --- a/test/routines/levelx/xomatcopy.hpp +++ b/test/routines/levelx/xomatcopy.hpp @@ -77,7 +77,7 @@ class TestXomatcopy { buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, &queue_plain, &event); - clWaitForEvents(1, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; } |