summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorcnugteren <web@cedricnugteren.nl>2016-04-03 16:08:48 -0700
committercnugteren <web@cedricnugteren.nl>2016-04-03 16:08:48 -0700
commit2981ca4d3caf5b1fcc346bc6df65f2ee6ccac141 (patch)
tree6ec690bcb9fd616af93821445887cf8c73cd4a2e
parent8217b017028412594f663a66187f99c3ee0878c9 (diff)
parentc4ab9bda6321aab66e05fd3d00e7b58443c640ef (diff)
Merge branch 'cpu_blas' into development
-rw-r--r--CHANGELOG1
-rw-r--r--CMakeLists.txt42
-rw-r--r--README.md13
-rw-r--r--cmake/Modules/FindCBLAS.cmake75
-rw-r--r--include/clblast.h2
-rw-r--r--include/clblast_c.h4
-rw-r--r--include/internal/clpp11.h14
-rw-r--r--include/internal/utilities.h5
-rw-r--r--scripts/generator/datatype.py5
-rw-r--r--scripts/generator/generator.py53
-rw-r--r--scripts/generator/routine.py109
-rw-r--r--src/clblast.cc6
-rw-r--r--src/clblast_c.cc4
-rw-r--r--test/correctness/testblas.cc45
-rw-r--r--test/correctness/testblas.h47
-rw-r--r--test/correctness/tester.cc16
-rw-r--r--test/correctness/tester.h6
-rw-r--r--test/performance/client.cc40
-rw-r--r--test/performance/client.h33
-rw-r--r--test/routines/level1/xaxpy.h46
-rw-r--r--test/routines/level1/xcopy.h46
-rw-r--r--test/routines/level1/xdot.h51
-rw-r--r--test/routines/level1/xdotc.h51
-rw-r--r--test/routines/level1/xdotu.h51
-rw-r--r--test/routines/level1/xnrm2.h46
-rw-r--r--test/routines/level1/xscal.h41
-rw-r--r--test/routines/level1/xswap.h47
-rw-r--r--test/routines/level2/xgbmv.h57
-rw-r--r--test/routines/level2/xgemv.h57
-rw-r--r--test/routines/level2/xger.h54
-rw-r--r--test/routines/level2/xgerc.h54
-rw-r--r--test/routines/level2/xgeru.h54
-rw-r--r--test/routines/level2/xhbmv.h57
-rw-r--r--test/routines/level2/xhemv.h57
-rw-r--r--test/routines/level2/xher.h52
-rw-r--r--test/routines/level2/xher2.h57
-rw-r--r--test/routines/level2/xhpmv.h57
-rw-r--r--test/routines/level2/xhpr.h52
-rw-r--r--test/routines/level2/xhpr2.h57
-rw-r--r--test/routines/level2/xsbmv.h57
-rw-r--r--test/routines/level2/xspmv.h57
-rw-r--r--test/routines/level2/xspr.h52
-rw-r--r--test/routines/level2/xspr2.h57
-rw-r--r--test/routines/level2/xsymv.h57
-rw-r--r--test/routines/level2/xsyr.h52
-rw-r--r--test/routines/level2/xsyr2.h57
-rw-r--r--test/routines/level2/xtbmv.h58
-rw-r--r--test/routines/level2/xtpmv.h58
-rw-r--r--test/routines/level2/xtrmv.h58
-rw-r--r--test/routines/level3/xgemm.h60
-rw-r--r--test/routines/level3/xhemm.h60
-rw-r--r--test/routines/level3/xher2k.h63
-rw-r--r--test/routines/level3/xherk.h55
-rw-r--r--test/routines/level3/xsymm.h60
-rw-r--r--test/routines/level3/xsyr2k.h60
-rw-r--r--test/routines/level3/xsyrk.h55
-rw-r--r--test/routines/level3/xtrmm.h61
-rw-r--r--test/wrapper_cblas.h1674
-rw-r--r--test/wrapper_clblas.h6
59 files changed, 3624 insertions, 657 deletions
diff --git a/CHANGELOG b/CHANGELOG
index c52e041d..db14f037 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,7 @@
Development version (next release)
- Added exports to be able to create a DLL on Windows (thanks to Marco Hutter)
- Made the library thread-safe
+- Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries
- Fixed the use of events within the library
- Added level-1 routines:
* SNRM2/DNRM2/ScNRM2/DzNRM2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8316a49a..21254ded 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -66,7 +66,7 @@ else ()
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
- set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
+ set(FLAGS "${FLAGS} -Wall -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
endif()
@@ -98,11 +98,13 @@ if(TUNERS)
endif()
endif()
-# Locates the clBLAS library in case the tests need to be compiled. "FindclBLAS.cmake" is included.
+# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake"
+# and "FindCBLAS.cmake" modules are included.
if(TESTS)
find_package(clBLAS)
- if(NOT CLBLAS_FOUND)
- message(STATUS "Could NOT find clBLAS, disabling the compilation of the tests")
+ find_package(CBLAS)
+ if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
+ message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
set(TESTS OFF)
endif()
endif()
@@ -215,11 +217,33 @@ endif()
# ==================================================================================================
# Down from here is all test (performance and correctness) related. Note that these tests require
-# the presence of the clBLAS library to act as a reference.
+# the presence of clBLAS and/or a BLAS library to act as a reference.
if(TESTS)
- # Adds new include directories for the reference clBLAS
- include_directories(${clblast_SOURCE_DIR}/test ${CLBLAS_INCLUDE_DIRS})
+ # Sets the specifics for the reference BLAS libraries
+ set(REF_INCLUDES )
+ set(REF_LIBRARIES )
+ if(CLBLAS_FOUND)
+ set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
+ set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES})
+ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+ add_definitions(" /DCLBLAST_REF_CLBLAS")
+ else()
+ add_definitions(" -DCLBLAST_REF_CLBLAS")
+ endif()
+ endif()
+ if(CBLAS_FOUND)
+ set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
+ set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
+ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+ add_definitions(" /DCLBLAST_REF_CBLAS")
+ else()
+ add_definitions(" -DCLBLAST_REF_CBLAS")
+ endif()
+ endif()
+
+ # Sets the include directories
+ include_directories(${clblast_SOURCE_DIR}/test ${REF_INCLUDES})
# Creates the common correctness-tests objects (requires CMake 2.8.8)
add_library(test_correctness_common OBJECT
@@ -239,7 +263,7 @@ if(TESTS)
test/correctness/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
- target_link_libraries(clblast_test_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
+ target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
endforeach()
@@ -269,7 +293,7 @@ if(TESTS)
test/performance/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
- target_link_libraries(clblast_client_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
+ target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
endforeach()
diff --git a/README.md b/README.md
index ac614026..d69ad552 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,14 @@ The pre-requisites for compilation of CLBlast are:
- Intel OpenCL
- Beignet
+Furthermore, to build the (optional) correctness and performance tests, another BLAS library is needed to serve as a reference. This can be either:
+
+* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD)
+* A regular CPU Netlib BLAS library, e.g.:
+ - OpenBLAS
+ - BLIS
+ - Accelerate
+
An example of an out-of-source build (starting from the root of the CLBlast folder):
mkdir build
@@ -135,9 +143,9 @@ To make sure CLBlast is working correctly on your device (recommended), compile
cmake -DTESTS=ON ..
-Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. The library clBLAS is therefore required to be installed on your system for the CLBlast tests.
+Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is best tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. If the library clBLAS is not installed on your system, it will use a regular CPU BLAS library to test against.
-With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run both CLBlast and clBLAS in a head-to-head performance test.
+With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test against clBLAS and/or a CPU BLAS library.
Performance remarks
@@ -249,4 +257,3 @@ To-do list before release of version 1.0
- Support all routines supported by clBLAS
- Allow the user control over events and synchronization
- Add half-precision routines (e.g. HGEMM)
-- Enable correctness and performance testing against a CPU-based BLAS library
diff --git a/cmake/Modules/FindCBLAS.cmake b/cmake/Modules/FindCBLAS.cmake
new file mode 100644
index 00000000..86f14515
--- /dev/null
+++ b/cmake/Modules/FindCBLAS.cmake
@@ -0,0 +1,75 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+# width of 100 characters per line.
+#
+# Author(s):
+# Cedric Nugteren <www.cedricnugteren.nl>
+#
+# ==================================================================================================
+#
+# Defines the following variables:
+# CBLAS_FOUND Boolean holding whether or not the Netlib BLAS library was found
+# CBLAS_INCLUDE_DIRS The Netlib BLAS include directory
+# CBLAS_LIBRARIES The Netlib BLAS library
+#
+# In case BLAS is not installed in the default directory, set the CBLAS_ROOT variable to point to
+# the root of BLAS, such that 'cblas.h' can be found in $CBLAS_ROOT/include. This can either be
+# done using an environment variable (e.g. export CBLAS_ROOT=/path/to/BLAS) or using a CMake
+# variable (e.g. cmake -DCBLAS_ROOT=/path/to/BLAS ..).
+#
+# ==================================================================================================
+
+# Sets the possible install locations
+set(CBLAS_HINTS
+ ${CBLAS_ROOT}
+ $ENV{CBLAS_ROOT}
+)
+set(CBLAS_PATHS
+ /usr
+ /usr/local
+ /usr/local/opt
+ /System/Library/Frameworks
+)
+
+# Finds the include directories
+find_path(CBLAS_INCLUDE_DIRS
+ NAMES cblas.h
+ HINTS ${CBLAS_HINTS}
+ PATH_SUFFIXES
+ include inc include/x86_64 include/x64
+ openblas/include include/blis blis/include blis/include/blis
+ Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers
+ PATHS ${CBLAS_PATHS}
+ DOC "Netlib BLAS include header cblas.h"
+)
+mark_as_advanced(CBLAS_INCLUDE_DIRS)
+
+# Finds the library
+find_library(CBLAS_LIBRARIES
+ NAMES blas mkl blis openblas atlas accelerate
+ HINTS ${CBLAS_HINTS}
+ PATH_SUFFIXES
+ lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
+ openblas/lib blis/lib
+ PATHS ${CBLAS_PATHS}
+ DOC "Netlib BLAS library"
+)
+mark_as_advanced(CBLAS_LIBRARIES)
+
+# ==================================================================================================
+
+# Notification messages
+if(NOT CBLAS_INCLUDE_DIRS)
+ message(STATUS "Could NOT find 'cblas.h', install a CPU Netlib BLAS or set CBLAS_ROOT")
+endif()
+if(NOT CBLAS_LIBRARIES)
+ message(STATUS "Could NOT find a CPU Netlib BLAS library, install it or set CBLAS_ROOT")
+endif()
+
+# Determines whether or not BLAS was found
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CBLAS DEFAULT_MSG CBLAS_INCLUDE_DIRS CBLAS_LIBRARIES)
+
+# ==================================================================================================
diff --git a/include/clblast.h b/include/clblast.h
index 5e5c5a46..431f2510 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -100,7 +100,7 @@ template <typename T>
StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
- cl_mem sy1_buffer, const size_t sy1_offset,
+ const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event = nullptr);
diff --git a/include/clblast_c.h b/include/clblast_c.h
index dcb3ae3a..f72cff3a 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -112,13 +112,13 @@ StatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
StatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
- cl_mem sy1_buffer, const size_t sy1_offset,
+ const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
- cl_mem sy1_buffer, const size_t sy1_offset,
+ const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event);
diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
index aac66396..00905ef7 100644
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@@ -465,31 +465,33 @@ class Buffer {
}
// Copies from device to host: reading the device buffer a-synchronously
- void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
+ void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
- const size_t offset = 0) {
+ const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
- const size_t offset = 0) {
+ const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}
// Copies from device to host: reading the device buffer
- void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
+ void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
ReadAsync(queue, size, host, offset);
queue.Finish();
}
- void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
+ void Read(const Queue &queue, const size_t size, std::vector<T> &host,
+ const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
- void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
+ void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
+ const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
diff --git a/include/internal/utilities.h b/include/internal/utilities.h
index 35f76722..6adc1d0a 100644
--- a/include/internal/utilities.h
+++ b/include/internal/utilities.h
@@ -35,6 +35,9 @@ using double2 = std::complex<double>;
const std::string kKhronosHalfPrecision = "cl_khr_fp16";
const std::string kKhronosDoublePrecision = "cl_khr_fp64";
+// Caught an unknown error
+constexpr auto kUnknownError = -999;
+
// =================================================================================================
// The routine-specific arguments in string form
@@ -70,6 +73,7 @@ constexpr auto kArgFraction = "fraction";
// The client-specific arguments in string form
constexpr auto kArgCompareclblas = "clblas";
+constexpr auto kArgComparecblas = "cblas";
constexpr auto kArgStepSize = "step";
constexpr auto kArgNumSteps = "num_steps";
constexpr auto kArgNumRuns = "runs";
@@ -128,6 +132,7 @@ struct Arguments {
double fraction = 1.0;
// Client-specific arguments
int compare_clblas = 1;
+ int compare_cblas = 1;
size_t step = 1;
size_t num_steps = 0;
size_t num_runs = 10;
diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py
index 9323bc4d..5a58ab53 100644
--- a/scripts/generator/datatype.py
+++ b/scripts/generator/datatype.py
@@ -58,5 +58,10 @@ class DataType():
return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp
return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp
+ # Current scalar is complex
+ def IsComplex(self, scalar):
+ return ((scalar == "alpha" and self.alpha_cpp in [FLT2, DBL2]) or
+ (scalar == "beta" and self.beta_cpp in [FLT2, DBL2]))
+
# ==================================================================================================
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 6e2b2ed2..bdf6b9d7 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -8,12 +8,13 @@
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This script automatically generates the bodies of the following files, creating the full CLBlast
-# API interface and implementation (C, C++, and clBLAS wrapper):
+# API interface and implementation (C, C++, and reference BLAS wrappers):
# clblast.h
# clblast.cc
# clblast_c.h
# clblast_c.cc
# wrapper_clblas.h
+# wrapper_cblas.h
# It also generates the main functions for the correctness and performance tests as found in
# test/correctness/routines/levelX/xYYYY.cc
# test/performance/routines/levelX/xYYYY.cc
@@ -55,7 +56,7 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T")
routines = [
[ # Level 1: vector-vector
Routine(False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation"),
- Routine(False, "1", "rotmg", T, [S,D], [], [], [], ["sd1","sd2","sx1","sy1","sparam"], [], "", "Generate modified givens plane rotation"),
+ Routine(False, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation"),
Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation"),
Routine(False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation"),
Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors"),
@@ -220,11 +221,11 @@ def wrapper_clblas(routines):
for routine in routines:
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
if routine.NoScalars():
- result += routine.RoutineHeaderWrapper(routine.template, True, 21)+";\n"
+ result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n"
for flavour in routine.flavours:
indent = " "*(17 + routine.Length())
- result += routine.RoutineHeaderWrapper(flavour, False, 21)+" {\n"
- arguments = routine.ArgumentsWrapper(flavour)
+ result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n"
+ arguments = routine.ArgumentsWrapperCL(flavour)
if routine.scratch:
result += " auto queue = Queue(queues[0]);\n"
result += " auto context = queue.GetContext();\n"
@@ -236,6 +237,41 @@ def wrapper_clblas(routines):
result += "\n}\n"
return result
+# The wrapper to the reference CBLAS routines (for performance/correctness testing)
+def wrapper_cblas(routines):
+ result = ""
+ for routine in routines:
+ result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames())
+ for flavour in routine.flavours:
+ indent = " "*(10 + routine.Length())
+ result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n"
+ arguments = routine.ArgumentsWrapperC(flavour)
+
+ # Complex scalars: CBLAS takes them by pointer, so split each into a {real, imag} array
+ for scalar in routine.scalars:
+ if flavour.IsComplex(scalar):
+ result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
+
+ # Special case for scalar outputs
+ assignment = ""
+ postfix = ""
+ extra_argument = ""
+ for output_buffer in routine.outputs:
+ if output_buffer in routine.ScalarBuffersFirst():
+ if flavour in [C,Z]:
+ postfix += "_sub"
+ indent += " "
+ extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
+ else:
+ assignment = output_buffer+"_buffer["+output_buffer+"_offset] = "
+ indent += " "*len(assignment)
+
+ result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
+ result += (",\n"+indent).join([a for a in arguments])
+ result += extra_argument+");"
+ result += "\n}\n"
+ return result
+
# ==================================================================================================
# Checks for the number of command-line arguments
@@ -251,9 +287,10 @@ files = [
path_clblast+"/include/clblast_c.h",
path_clblast+"/src/clblast_c.cc",
path_clblast+"/test/wrapper_clblas.h",
+ path_clblast+"/test/wrapper_cblas.h",
]
-header_lines = [84, 65, 93, 22, 22]
-footer_lines = [6, 3, 9, 2, 6]
+header_lines = [84, 65, 93, 22, 22, 38]
+footer_lines = [6, 3, 9, 2, 6, 6]
# Checks whether the command-line arguments are valid; exists otherwise
for f in files:
@@ -287,6 +324,8 @@ for i in xrange(0,len(files)):
body += clblast_c_cc(routines[level-1])
if i == 4:
body += wrapper_clblas(routines[level-1])
+ if i == 5:
+ body += wrapper_cblas(routines[level-1])
f.write("".join(file_header))
f.write(body)
f.write("".join(file_footer))
diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py
index 02040583..fffa19f6 100644
--- a/scripts/generator/routine.py
+++ b/scripts/generator/routine.py
@@ -28,7 +28,7 @@ def OptionToCLBlast(x):
}[x]
# As above, but for clBLAS data-types
-def OptionToWrapper(x):
+def OptionToWrapperCL(x):
return {
'layout': "clblasOrder",
'a_transpose': "clblasTranspose",
@@ -39,6 +39,18 @@ def OptionToWrapper(x):
'diagonal': "clblasDiag",
}[x]
+# As above, but for CBLAS data-types
+def OptionToWrapperC(x):
+ return {
+ 'layout': "CBLAS_ORDER",
+ 'a_transpose': "CBLAS_TRANSPOSE",
+ 'b_transpose': "CBLAS_TRANSPOSE",
+ 'ab_transpose': "CBLAS_TRANSPOSE",
+ 'side': "CBLAS_SIDE",
+ 'triangle': "CBLAS_UPLO",
+ 'diagonal': "CBLAS_DIAG",
+ }[x]
+
# ==================================================================================================
# Class holding routine-specific information (e.g. name, which arguments, which precisions)
@@ -119,6 +131,16 @@ class Routine():
return [", ".join(a+b+c)]
return []
+ # As above but as vectors
+ def BufferDefVector(self, name, flavour):
+ prefix = "const " if (name in self.inputs) else ""
+ if (name in self.inputs) or (name in self.outputs):
+ a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"]
+ b = ["const size_t "+name+"_offset"]
+ c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
+ return [", ".join(a+b+c)]
+ return []
+
# As above but with Claduc buffers
def BufferCladuc(self, name):
if (name in self.inputs) or (name in self.outputs):
@@ -129,7 +151,7 @@ class Routine():
return []
# As above but with a static cast for clBLAS wrapper
- def BufferWrapper(self, name):
+ def BufferWrapperCL(self, name):
if (name in self.inputs) or (name in self.outputs):
a = [name+"_buffer"]
b = [name+"_offset"]
@@ -141,6 +163,24 @@ class Routine():
return [", ".join(a+b+c)]
return []
+ # As above but with a static cast for CBLAS wrapper
+ def BufferWrapperC(self, name, flavour):
+ prefix = "const " if (name in self.inputs) else ""
+ if (name in self.inputs) or (name in self.outputs):
+ if name == "sy1":
+ a = [name+"_buffer["+name+"_offset]"]
+ elif flavour.precision_name in ["C","Z"]:
+ a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"]
+ else:
+ a = ["&"+name+"_buffer["+name+"_offset]"]
+ c = []
+ if (name in ["x","y"]):
+ c = ["static_cast<int>("+name+"_"+self.Postfix(name)+")"]
+ elif (name in ["a","b","c"]):
+ c = [name+"_"+self.Postfix(name)]
+ return [", ".join(a+c)]
+ return []
+
# As above, but only data-types
def BufferType(self, name):
prefix = "const " if (name in self.inputs) else ""
@@ -179,6 +219,14 @@ class Routine():
return [name]
return []
+ # Retrieves the use of a scalar for CBLAS (alpha/beta)
+ def ScalarUseWrapperC(self, name, flavour):
+ if name in self.scalars:
+ if flavour.IsComplex(name):
+ return [name+"_array.data()"]
+ return [name]
+ return []
+
# Retrieves the definition of a scalar (alpha/beta)
def ScalarDef(self, name, flavour):
if name in self.scalars:
@@ -246,9 +294,16 @@ class Routine():
return []
# As above, but now using clBLAS data-types
- def OptionsDefWrapper(self):
+ def OptionsDefWrapperCL(self):
+ if self.options:
+ definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options]
+ return [", ".join(definitions)]
+ return []
+
+ # As above, but now using CBLAS data-types
+ def OptionsDefWrapperC(self):
if self.options:
- definitions = ["const "+OptionToWrapper(o)+" "+o for o in self.options]
+ definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options]
return [", ".join(definitions)]
return []
@@ -284,16 +339,26 @@ class Routine():
list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()])))
# As above, but for the clBLAS wrapper
- def ArgumentsWrapper(self, flavour):
+ def ArgumentsWrapperCL(self, flavour):
return (self.Options() + self.Sizes() +
- list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersFirst()])) +
+ list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarUseWrapper("alpha", flavour) +
- list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) +
+ list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) +
self.ScalarUseWrapper("beta", flavour) +
- list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) +
- list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersSecond()])) +
+ list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersSecond()])) +
+ list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()])))
+ # As above, but for the CBLAS wrapper
+ def ArgumentsWrapperC(self, flavour):
+ return (self.Options() + self.Sizes() +
+ self.ScalarUseWrapperC("alpha", flavour) +
+ list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersFirst()])) +
+ self.ScalarUseWrapperC("beta", flavour) +
+ list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) +
+ list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) +
+ list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()])))
+
# Retrieves a combination of all the argument definitions
def ArgumentsDef(self, flavour):
return (self.OptionsDef() + self.SizesDef() +
@@ -306,8 +371,8 @@ class Routine():
list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()])))
# As above, but clBLAS wrapper plain datatypes
- def ArgumentsDefWrapper(self, flavour):
- return (self.OptionsDefWrapper() + self.SizesDef() +
+ def ArgumentsDefWrapperCL(self, flavour):
+ return (self.OptionsDefWrapperCL() + self.SizesDef() +
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarDefPlain("alpha", flavour) +
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
@@ -315,6 +380,17 @@ class Routine():
list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
+
+ # As above, but CBLAS wrapper plain datatypes
+ def ArgumentsDefWrapperC(self, flavour):
+ return (self.OptionsDefWrapperC() + self.SizesDef() +
+ list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) +
+ self.ScalarDefPlain("alpha", flavour) +
+ list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersFirst()])) +
+ self.ScalarDefPlain("beta", flavour) +
+ list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersSecond()])) +
+ list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) +
+ list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument types
def ArgumentsType(self, flavour):
@@ -356,7 +432,7 @@ class Routine():
return result
# As above, but now for the clBLAS wrapper
- def RoutineHeaderWrapper(self, flavour, def_only, spaces):
+ def RoutineHeaderWrapperCL(self, flavour, def_only, spaces):
template = "<"+flavour.template+">" if self.NoScalars() and not def_only else ""
indent = " "*(spaces + self.Length() + len(template))
result = ""
@@ -366,9 +442,16 @@ class Routine():
result += flavour.name
result += ">\n"
result += "clblasStatus clblasX"+self.name+template+"("
- result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapper(flavour)])
+ result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)])
result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues"
result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)"
return result
+ # As above, but now for the CBLAS wrapper
+ def RoutineHeaderWrapperC(self, flavour, def_only, spaces):
+ indent = " "*(spaces + self.Length())
+ result = "void cblasX"+self.name+"("
+ result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperC(flavour)])+")"
+ return result
+
# ==================================================================================================
diff --git a/src/clblast.cc b/src/clblast.cc
index fc50ffae..75893ee9 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -93,7 +93,7 @@ template <typename T>
StatusCode Rotmg(cl_mem, const size_t,
cl_mem, const size_t,
cl_mem, const size_t,
- cl_mem, const size_t,
+ const cl_mem, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
@@ -101,13 +101,13 @@ StatusCode Rotmg(cl_mem, const size_t,
template StatusCode PUBLIC_API Rotmg<float>(cl_mem, const size_t,
cl_mem, const size_t,
cl_mem, const size_t,
- cl_mem, const size_t,
+ const cl_mem, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Rotmg<double>(cl_mem, const size_t,
cl_mem, const size_t,
cl_mem, const size_t,
- cl_mem, const size_t,
+ const cl_mem, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
diff --git a/src/clblast_c.cc b/src/clblast_c.cc
index 6d10c686..23e97bd5 100644
--- a/src/clblast_c.cc
+++ b/src/clblast_c.cc
@@ -55,7 +55,7 @@ StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
- cl_mem sy1_buffer, const size_t sy1_offset,
+ const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotmg<float>(sd1_buffer, sd1_offset,
@@ -69,7 +69,7 @@ StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
- cl_mem sy1_buffer, const size_t sy1_offset,
+ const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotmg<double>(sd1_buffer, sd1_offset,
diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc
index 1329b2c5..cc9a5adb 100644
--- a/test/correctness/testblas.cc
+++ b/test/correctness/testblas.cc
@@ -79,24 +79,6 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
// Iterates over all the to-be-tested combinations of arguments
for (auto &args: test_vector) {
- // Runs the reference clBLAS code
- auto x_vec1 = Buffer<T>(context_, args.x_size);
- auto y_vec1 = Buffer<T>(context_, args.y_size);
- auto a_mat1 = Buffer<T>(context_, args.a_size);
- auto b_mat1 = Buffer<T>(context_, args.b_size);
- auto c_mat1 = Buffer<T>(context_, args.c_size);
- auto ap_mat1 = Buffer<T>(context_, args.ap_size);
- auto scalar1 = Buffer<T>(context_, args.scalar_size);
- x_vec1.Write(queue_, args.x_size, x_source_);
- y_vec1.Write(queue_, args.y_size, y_source_);
- a_mat1.Write(queue_, args.a_size, a_source_);
- b_mat1.Write(queue_, args.b_size, b_source_);
- c_mat1.Write(queue_, args.c_size, c_source_);
- ap_mat1.Write(queue_, args.ap_size, ap_source_);
- scalar1.Write(queue_, args.scalar_size, scalar_source_);
- auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
- auto status1 = run_reference_(args, buffers1, queue_);
-
// Runs the CLBlast code
auto x_vec2 = Buffer<T>(context_, args.x_size);
auto y_vec2 = Buffer<T>(context_, args.y_size);
@@ -115,6 +97,33 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
auto status2 = run_routine_(args, buffers2, queue_);
+ #ifndef CLBLAST_REF_CLBLAS
+ // Don't continue with CBLAS if there are incorrect parameters
+ if (status2 != StatusCode::kSuccess) {
+ // TODO: Mark this as a skipped test instead of a successful test
+ TestErrorCodes(status2, status2, args);
+ continue;
+ }
+ #endif
+
+ // Runs the reference BLAS code
+ auto x_vec1 = Buffer<T>(context_, args.x_size);
+ auto y_vec1 = Buffer<T>(context_, args.y_size);
+ auto a_mat1 = Buffer<T>(context_, args.a_size);
+ auto b_mat1 = Buffer<T>(context_, args.b_size);
+ auto c_mat1 = Buffer<T>(context_, args.c_size);
+ auto ap_mat1 = Buffer<T>(context_, args.ap_size);
+ auto scalar1 = Buffer<T>(context_, args.scalar_size);
+ x_vec1.Write(queue_, args.x_size, x_source_);
+ y_vec1.Write(queue_, args.y_size, y_source_);
+ a_mat1.Write(queue_, args.a_size, a_source_);
+ b_mat1.Write(queue_, args.b_size, b_source_);
+ c_mat1.Write(queue_, args.c_size, c_source_);
+ ap_mat1.Write(queue_, args.ap_size, ap_source_);
+ scalar1.Write(queue_, args.scalar_size, scalar_source_);
+ auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
+ auto status1 = run_reference_(args, buffers1, queue_);
+
// Tests for equality of the two status codes
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
TestErrorCodes(status1, status2, args);
diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h
index 7c9032bd..8181aaf6 100644
--- a/test/correctness/testblas.h
+++ b/test/correctness/testblas.h
@@ -68,7 +68,7 @@ class TestBlas: public Tester<T,U> {
static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file
// Shorthand for the routine-specific functions passed to the tester
- using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers<T>&, Queue&)>;
+ using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers<T>&, Queue&)>;
using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>;
using ResultIterator = std::function<size_t(const Arguments<U>&)>;
@@ -76,8 +76,9 @@ class TestBlas: public Tester<T,U> {
// Constructor, initializes the base class tester and input data
TestBlas(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
- const Routine run_routine, const Routine run_reference, const ResultGet get_result,
- const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2);
+ const Routine run_routine, const Routine run_reference,
+ const ResultGet get_result, const ResultIndex get_index,
+ const ResultIterator get_id1, const ResultIterator get_id2);
// The test functions, taking no inputs
void TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name);
@@ -110,9 +111,17 @@ class TestBlas: public Tester<T,U> {
template <typename C, typename T, typename U>
void RunTests(int argc, char *argv[], const bool silent, const std::string &name) {
+ // Sets the reference to test against
+ #ifdef CLBLAST_REF_CLBLAS
+ const auto reference_routine = C::RunReference1; // clBLAS when available
+ #else
+ const auto reference_routine = C::RunReference2; // otherwise CBLAS
+ #endif
+
// Creates a tester
auto options = C::GetOptions();
- TestBlas<T,U> tester{argc, argv, silent, name, options, C::RunRoutine, C::RunReference,
+ TestBlas<T,U> tester{argc, argv, silent, name, options,
+ C::RunRoutine, reference_routine,
C::DownloadResult, C::GetResultIndex, C::ResultID1, C::ResultID2};
// This variable holds the arguments relevant for this routine
@@ -250,23 +259,25 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
}
// Creates the arguments vector for the invalid-buffer tests
- auto invalid_test_vector = std::vector<Arguments<U>>{};
- auto i_args = args;
- i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
- i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize;
- for (auto &x_size: x_sizes) { i_args.x_size = x_size;
- for (auto &y_size: y_sizes) { i_args.y_size = y_size;
- for (auto &a_size: a_sizes) { i_args.a_size = a_size;
- for (auto &b_size: b_sizes) { i_args.b_size = b_size;
- for (auto &c_size: c_sizes) { i_args.c_size = c_size;
- for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size;
- invalid_test_vector.push_back(i_args);
+ #ifdef CLBLAST_REF_CLBLAS
+ auto invalid_test_vector = std::vector<Arguments<U>>{};
+ auto i_args = args;
+ i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
+ i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize;
+ for (auto &x_size: x_sizes) { i_args.x_size = x_size;
+ for (auto &y_size: y_sizes) { i_args.y_size = y_size;
+ for (auto &a_size: a_sizes) { i_args.a_size = a_size;
+ for (auto &b_size: b_sizes) { i_args.b_size = b_size;
+ for (auto &c_size: c_sizes) { i_args.c_size = c_size;
+ for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size;
+ invalid_test_vector.push_back(i_args);
+ }
}
}
}
}
}
- }
+ #endif
// Sets the name of this test-case
auto names = std::vector<std::string>{};
@@ -287,7 +298,9 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
// Runs the tests
tester.TestRegular(regular_test_vector, case_name);
- tester.TestInvalid(invalid_test_vector, case_name);
+ #ifdef CLBLAST_REF_CLBLAS
+ tester.TestInvalid(invalid_test_vector, case_name);
+ #endif
}
}
}
diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc
index 8169f700..872a131a 100644
--- a/test/correctness/tester.cc
+++ b/test/correctness/tester.cc
@@ -69,10 +69,12 @@ Tester<T,U>::Tester(int argc, char *argv[], const bool silent,
kUnsupportedPrecision.c_str());
// Initializes clBLAS
- auto status = clblasSetup();
- if (status != CL_SUCCESS) {
- throw std::runtime_error("clBLAS setup error: "+ToString(static_cast<int>(status)));
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ auto status = clblasSetup();
+ if (status != CL_SUCCESS) {
+ throw std::runtime_error("clBLAS setup error: "+ToString(static_cast<int>(status)));
+ }
+ #endif
}
// Destructor prints the summary of the test cases and cleans-up the clBLAS library
@@ -87,7 +89,11 @@ Tester<T,U>::~Tester() {
fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
}
fprintf(stdout, "\n");
- clblasTeardown();
+
+ // Cleans-up clBLAS
+ #ifdef CLBLAST_REF_CLBLAS
+ clblasTeardown();
+ #endif
}
// =================================================================================================
diff --git a/test/correctness/tester.h b/test/correctness/tester.h
index db714f3d..d489f829 100644
--- a/test/correctness/tester.h
+++ b/test/correctness/tester.h
@@ -23,7 +23,9 @@
#include <memory>
// The libraries
-#include <clBLAS.h>
+#ifdef CLBLAST_REF_CLBLAS
+ #include <clBLAS.h>
+#endif
#include "clblast.h"
#include "internal/utilities.h"
@@ -92,7 +94,7 @@ class Tester {
Queue queue_;
// Whether or not to run the full test-suite or just a smoke test
- bool full_test_;
+ const bool full_test_;
// Retrieves the offset values to test with
const std::vector<size_t> GetOffsets() const;
diff --git a/test/performance/client.cc b/test/performance/client.cc
index 17f54231..56ab8c8d 100644
--- a/test/performance/client.cc
+++ b/test/performance/client.cc
@@ -24,11 +24,13 @@ namespace clblast {
// Constructor
template <typename T, typename U>
-Client<T,U>::Client(const Routine run_routine, const Routine run_reference,
+Client<T,U>::Client(const Routine run_routine,
+ const Routine run_reference1, const Routine run_reference2,
const std::vector<std::string> &options,
const GetMetric get_flops, const GetMetric get_bytes):
run_routine_(run_routine),
- run_reference_(run_reference),
+ run_reference1_(run_reference1),
+ run_reference2_(run_reference2),
options_(options),
get_flops_(get_flops),
get_bytes_(get_bytes) {
@@ -90,7 +92,16 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
- args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1);
+ #ifdef CLBLAST_REF_CLBLAS
+ args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1);
+ #else
+ args.compare_clblas = 0;
+ #endif
+ #ifdef CLBLAST_REF_CBLAS
+ args.compare_cblas = GetArgument(argc, argv, help, kArgComparecblas, 1);
+ #else
+ args.compare_cblas = 0;
+ #endif
args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1});
args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0});
args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10});
@@ -120,7 +131,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
auto device = Device(platform, args.device_id);
auto context = Context(device);
auto queue = Queue(context, device);
- if (args.compare_clblas) { clblasSetup(); }
+ #ifdef CLBLAST_REF_CLBLAS
+ if (args.compare_clblas) { clblasSetup(); }
+ #endif
// Iterates over all "num_step" values jumping by "step" each time
auto s = size_t{0};
@@ -167,9 +180,13 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
timings.push_back(std::pair<std::string, double>("CLBlast", ms_clblast));
if (args.compare_clblas) {
- auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
+ auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS");
timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas));
}
+ if (args.compare_cblas) {
+ auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS");
+ timings.push_back(std::pair<std::string, double>("CPU BLAS", ms_cblas));
+ }
// Prints the performance of the tested libraries
PrintTableRow(args, timings);
@@ -186,7 +203,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
}
// Cleans-up and returns
- if (args.compare_clblas) { clblasTeardown(); }
+ #ifdef CLBLAST_REF_CLBLAS
+ if (args.compare_clblas) { clblasTeardown(); }
+ #endif
}
// =================================================================================================
@@ -196,14 +215,17 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
// value found in the vector of timing results. The return value is in milliseconds.
template <typename T, typename U>
double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
- const Buffers<T> &buffers, Queue &queue,
+ Buffers<T> &buffers, Queue &queue,
Routine run_blas, const std::string &library_name) {
auto timings = std::vector<double>(num_runs);
for (auto &timing: timings) {
auto start_time = std::chrono::steady_clock::now();
// Executes the main computation
- auto status = run_blas(args, buffers, queue);
+ auto status = StatusCode::kSuccess;
+ try {
+ status = run_blas(args, buffers, queue);
+ } catch (...) { status = static_cast<StatusCode>(kUnknownError); }
if (status != StatusCode::kSuccess) {
throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
}
@@ -226,6 +248,7 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); }
fprintf(stdout, " | <-- CLBlast -->");
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
+ if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); }
fprintf(stdout, " |\n");
}
@@ -233,6 +256,7 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); }
fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
+ if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); }
fprintf(stdout, "\n");
}
diff --git a/test/performance/client.h b/test/performance/client.h
index 5805b8a5..8d0597d7 100644
--- a/test/performance/client.h
+++ b/test/performance/client.h
@@ -26,7 +26,9 @@
#include <utility>
// The libraries to test
-#include <clBLAS.h>
+#ifdef CLBLAST_REF_CLBLAS
+ #include <clBLAS.h>
+#endif
#include "clblast.h"
#include "internal/utilities.h"
@@ -40,12 +42,12 @@ class Client {
public:
// Shorthand for the routine-specific functions passed to the tester
- using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers<T>&, Queue&)>;
+ using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
using SetMetric = std::function<void(Arguments<U>&)>;
using GetMetric = std::function<size_t(const Arguments<U>&)>;
// The constructor
- Client(const Routine run_routine, const Routine run_reference,
+ Client(const Routine run_routine, const Routine run_reference1, const Routine run_reference2,
const std::vector<std::string> &options,
const GetMetric get_flops, const GetMetric get_bytes);
@@ -61,7 +63,7 @@ class Client {
private:
// Runs a function a given number of times and returns the execution time of the shortest instance
- double TimedExecution(const size_t num_runs, const Arguments<U> &args, const Buffers<T> &buffers,
+ double TimedExecution(const size_t num_runs, const Arguments<U> &args, Buffers<T> &buffers,
Queue &queue, Routine run_blas, const std::string &library_name);
// Prints the header of a performance-data table
@@ -73,7 +75,8 @@ class Client {
// The routine-specific functions passed to the tester
const Routine run_routine_;
- const Routine run_reference_;
+ const Routine run_reference1_;
+ const Routine run_reference2_;
const std::vector<std::string> options_;
const GetMetric get_flops_;
const GetMetric get_bytes_;
@@ -81,13 +84,31 @@ class Client {
// =================================================================================================
+// Bogus reference function, in case a comparison library is not available
+template <typename T, typename U>
+static StatusCode ReferenceNotAvailable(const Arguments<U> &, Buffers<T> &, Queue &) {
+ return StatusCode::kNotImplemented;
+}
+
// The interface to the performance client. This is a separate function in the header such that it
// is automatically compiled for each routine, templated by the parameter "C".
template <typename C, typename T, typename U>
void RunClient(int argc, char *argv[]) {
+ // Sets the reference to test against
+ #ifdef CLBLAST_REF_CLBLAS
+ const auto reference1 = C::RunReference1; // clBLAS when available
+ #else
+ const auto reference1 = ReferenceNotAvailable<T,U>;
+ #endif
+ #ifdef CLBLAST_REF_CBLAS
+ const auto reference2 = C::RunReference2; // CBLAS when available
+ #else
+ const auto reference2 = ReferenceNotAvailable<T,U>;
+ #endif
+
// Creates a new client
- auto client = Client<T,U>(C::RunRoutine, C::RunReference, C::GetOptions(),
+ auto client = Client<T,U>(C::RunRoutine, reference1, reference2, C::GetOptions(),
C::GetFlops, C::GetBytes);
// Simple command line argument parser with defaults
diff --git a/test/routines/level1/xaxpy.h b/test/routines/level1/xaxpy.h
index 50480f46..8f72f570 100644
--- a/test/routines/level1/xaxpy.h
+++ b/test/routines/level1/xaxpy.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -65,7 +70,7 @@ class TestXaxpy {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Axpy(args.n, args.alpha,
@@ -77,16 +82,33 @@ class TestXaxpy {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXaxpy(args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXaxpy(args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXaxpy(args.n, args.alpha,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xcopy.h b/test/routines/level1/xcopy.h
index 8d324d88..0527ca6a 100644
--- a/test/routines/level1/xcopy.h
+++ b/test/routines/level1/xcopy.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -64,7 +69,7 @@ class TestXcopy {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Copy<T>(args.n,
@@ -76,16 +81,33 @@ class TestXcopy {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXcopy<T>(args.n,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXcopy<T>(args.n,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXcopy(args.n,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xdot.h b/test/routines/level1/xdot.h
index 04669f52..d1c34c0f 100644
--- a/test/routines/level1/xdot.h
+++ b/test/routines/level1/xdot.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -68,7 +73,7 @@ class TestXdot {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Dot<T>(args.n,
@@ -81,17 +86,37 @@ class TestXdot {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXdot<T>(args.n,
- buffers.scalar(), args.dot_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXdot<T>(args.n,
+ buffers.scalar(), args.dot_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXdot(args.n,
+ scalar_cpu, args.dot_offset,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xdotc.h b/test/routines/level1/xdotc.h
index e5b42ef4..a2742cb0 100644
--- a/test/routines/level1/xdotc.h
+++ b/test/routines/level1/xdotc.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -68,7 +73,7 @@ class TestXdotc {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Dotc<T>(args.n,
@@ -81,17 +86,37 @@ class TestXdotc {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXdotc<T>(args.n,
- buffers.scalar(), args.dot_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXdotc<T>(args.n,
+ buffers.scalar(), args.dot_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXdotc(args.n,
+ scalar_cpu, args.dot_offset,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xdotu.h b/test/routines/level1/xdotu.h
index 6430148c..06ce979e 100644
--- a/test/routines/level1/xdotu.h
+++ b/test/routines/level1/xdotu.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -68,7 +73,7 @@ class TestXdotu {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Dotu<T>(args.n,
@@ -81,17 +86,37 @@ class TestXdotu {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXdotu<T>(args.n,
- buffers.scalar(), args.dot_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXdotu<T>(args.n,
+ buffers.scalar(), args.dot_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXdotu(args.n,
+ scalar_cpu, args.dot_offset,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xnrm2.h b/test/routines/level1/xnrm2.h
index e3f77ee4..d8a0de4e 100644
--- a/test/routines/level1/xnrm2.h
+++ b/test/routines/level1/xnrm2.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -64,7 +69,7 @@ class TestXnrm2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Nrm2<T>(args.n,
@@ -76,16 +81,33 @@ class TestXnrm2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXnrm2<T>(args.n,
- buffers.scalar(), args.nrm2_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS reference (clblasXnrm2) on the scalar and x buffers and returns the clBLAS
+ // status converted to a StatusCode. The completion event is only waited on and released when
+ // one was actually handed back: `event` starts out as a null handle, and a returned event has
+ // to be released or it leaks on every call.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   // Trailing arguments: presumably one command queue, an empty wait list and the output event
+   auto status = clblasXnrm2<T>(args.n,
+       buffers.scalar(), args.nrm2_offset,
+       buffers.x_vec(), args.x_offset, args.x_inc,
+       1, &queue_plain, 0, nullptr, &event);
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for the nrm2 routine: calls cblasXnrm2 on host-side copies of the scalar and x
+ // buffers and stores the scalar back into buffers.scalar. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   // Each host vector is zero-filled first and then populated from its buffer
+   auto host_scalar = std::vector<T>(args.scalar_size, zero);
+   buffers.scalar.Read(queue, args.scalar_size, host_scalar);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   cblasXnrm2(args.n, host_scalar, args.nrm2_offset,
+              host_x, args.x_offset, args.x_inc);
+   // Only the scalar is written back
+   buffers.scalar.Write(queue, args.scalar_size, host_scalar);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xscal.h b/test/routines/level1/xscal.h
index d990afcc..35855dbd 100644
--- a/test/routines/level1/xscal.h
+++ b/test/routines/level1/xscal.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -61,7 +66,7 @@ class TestXscal {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Scal(args.n, args.alpha,
@@ -72,15 +77,29 @@ class TestXscal {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXscal(args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS reference (clblasXscal) on the x buffer and returns the clBLAS status
+ // converted to a StatusCode. The completion event is only waited on and released when one was
+ // actually handed back: `event` starts out as a null handle, and a returned event has to be
+ // released or it leaks on every call.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   // Trailing arguments: presumably one command queue, an empty wait list and the output event
+   auto status = clblasXscal(args.n, args.alpha,
+       buffers.x_vec(), args.x_offset, args.x_inc,
+       1, &queue_plain, 0, nullptr, &event);
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for the scal routine: calls cblasXscal on a host-side copy of the x buffer and
+ // stores that vector back into buffers.x_vec. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   // Zero-filled host vector, populated from the x buffer
+   auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   cblasXscal(args.n, args.alpha, host_x, args.x_offset, args.x_inc);
+   buffers.x_vec.Write(queue, args.x_size, host_x);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xswap.h b/test/routines/level1/xswap.h
index 2096a2c3..ae69d3be 100644
--- a/test/routines/level1/xswap.h
+++ b/test/routines/level1/xswap.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -64,7 +69,7 @@ class TestXswap {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Swap<T>(args.n,
@@ -76,16 +81,34 @@ class TestXswap {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXswap<T>(args.n,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS reference (clblasXswap) on the x and y buffers and returns the clBLAS status
+ // converted to a StatusCode. The completion event is only waited on and released when one was
+ // actually handed back: `event` starts out as a null handle, and a returned event has to be
+ // released or it leaks on every call.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   // Trailing arguments: presumably one command queue, an empty wait list and the output event
+   auto status = clblasXswap<T>(args.n,
+       buffers.x_vec(), args.x_offset, args.x_inc,
+       buffers.y_vec(), args.y_offset, args.y_inc,
+       1, &queue_plain, 0, nullptr, &event);
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for the swap routine: calls cblasXswap on host-side copies of the x and y
+ // buffers and stores both vectors back into their buffers. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   // Each host vector is zero-filled first and then populated from its buffer
+   auto host_x = std::vector<T>(args.x_size, zero);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   cblasXswap(args.n,
+              host_x, args.x_offset, args.x_inc,
+              host_y, args.y_offset, args.y_inc);
+   // Both vectors are written back, x first
+   buffers.x_vec.Write(queue, args.x_size, host_x);
+   buffers.y_vec.Write(queue, args.y_size, host_y);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h
index 0e238804..b875075d 100644
--- a/test/routines/level2/xgbmv.h
+++ b/test/routines/level2/xgbmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -76,7 +81,7 @@ class TestXgbmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gbmv(args.layout, args.a_transpose,
@@ -90,19 +95,41 @@ class TestXgbmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasTranspose>(args.a_transpose),
- args.m, args.n, args.kl, args.ku, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS reference (clblasXgbmv) on the a/x/y buffers and returns the clBLAS status
+ // converted to a StatusCode. The completion event is only waited on and released when one was
+ // actually handed back: `event` starts out as a null handle, and a returned event has to be
+ // released or it leaks on every call.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   // Trailing arguments: presumably one command queue, an empty wait list and the output event
+   auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout),
+       static_cast<clblasTranspose>(args.a_transpose),
+       args.m, args.n, args.kl, args.ku, args.alpha,
+       buffers.a_mat(), args.a_offset, args.a_ld,
+       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+       buffers.y_vec(), args.y_offset, args.y_inc,
+       1, &queue_plain, 0, nullptr, &event);
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for the gbmv routine: calls cblasXgbmv on host-side copies of the a, x and y
+ // buffers and stores the y vector back into buffers.y_vec. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   // Each host container is zero-filled first and then populated from its buffer
+   auto host_a = std::vector<T>(args.a_size, zero);
+   buffers.a_mat.Read(queue, args.a_size, host_a);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   cblasXgbmv(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose),
+              args.m, args.n, args.kl, args.ku, args.alpha,
+              host_a, args.a_offset, args.a_ld,
+              host_x, args.x_offset, args.x_inc, args.beta,
+              host_y, args.y_offset, args.y_inc);
+   // Only the y vector is written back
+   buffers.y_vec.Write(queue, args.y_size, host_y);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.h
index 2924d498..a70ccd34 100644
--- a/test/routines/level2/xgemv.h
+++ b/test/routines/level2/xgemv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -76,7 +81,7 @@ class TestXgemv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gemv(args.layout, args.a_transpose,
@@ -90,19 +95,41 @@ class TestXgemv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasTranspose>(args.a_transpose),
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS reference (clblasXgemv) on the a/x/y buffers and returns the clBLAS status
+ // converted to a StatusCode. The completion event is only waited on and released when one was
+ // actually handed back: `event` starts out as a null handle, and a returned event has to be
+ // released or it leaks on every call.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   // Trailing arguments: presumably one command queue, an empty wait list and the output event
+   auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
+       static_cast<clblasTranspose>(args.a_transpose),
+       args.m, args.n, args.alpha,
+       buffers.a_mat(), args.a_offset, args.a_ld,
+       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+       buffers.y_vec(), args.y_offset, args.y_inc,
+       1, &queue_plain, 0, nullptr, &event);
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for the gemv routine: calls cblasXgemv on host-side copies of the a, x and y
+ // buffers and stores the y vector back into buffers.y_vec. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   // Each host container is zero-filled first and then populated from its buffer
+   auto host_a = std::vector<T>(args.a_size, zero);
+   buffers.a_mat.Read(queue, args.a_size, host_a);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   cblasXgemv(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose),
+              args.m, args.n, args.alpha,
+              host_a, args.a_offset, args.a_ld,
+              host_x, args.x_offset, args.x_inc, args.beta,
+              host_y, args.y_offset, args.y_inc);
+   // Only the y vector is written back
+   buffers.y_vec.Write(queue, args.y_size, host_y);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xger.h b/test/routines/level2/xger.h
index 98296e92..32c2a505 100644
--- a/test/routines/level2/xger.h
+++ b/test/routines/level2/xger.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -72,7 +77,7 @@ class TestXger {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Ger(args.layout,
@@ -86,18 +91,39 @@ class TestXger {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXger(static_cast<clblasOrder>(args.layout),
- args.m, args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS reference (clblasXger) on the x/y/a buffers and returns the clBLAS status
+ // converted to a StatusCode. The completion event is only waited on and released when one was
+ // actually handed back: `event` starts out as a null handle, and a returned event has to be
+ // released or it leaks on every call.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   // Trailing arguments: presumably one command queue, an empty wait list and the output event
+   auto status = clblasXger(static_cast<clblasOrder>(args.layout),
+       args.m, args.n, args.alpha,
+       buffers.x_vec(), args.x_offset, args.x_inc,
+       buffers.y_vec(), args.y_offset, args.y_inc,
+       buffers.a_mat(), args.a_offset, args.a_ld,
+       1, &queue_plain, 0, nullptr, &event);
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for the ger routine: calls cblasXger on host-side copies of the a, x and y
+ // buffers and stores the matrix back into buffers.a_mat. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   // Each host container is zero-filled first and then populated from its buffer
+   auto host_a = std::vector<T>(args.a_size, zero);
+   buffers.a_mat.Read(queue, args.a_size, host_a);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   cblasXger(convertToCBLAS(args.layout),
+             args.m, args.n, args.alpha,
+             host_x, args.x_offset, args.x_inc,
+             host_y, args.y_offset, args.y_inc,
+             host_a, args.a_offset, args.a_ld);
+   // Only the matrix is written back
+   buffers.a_mat.Write(queue, args.a_size, host_a);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xgerc.h b/test/routines/level2/xgerc.h
index 77258d92..4b6954f6 100644
--- a/test/routines/level2/xgerc.h
+++ b/test/routines/level2/xgerc.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -72,7 +77,7 @@ class TestXgerc {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gerc(args.layout,
@@ -86,18 +91,39 @@ class TestXgerc {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXgerc(static_cast<clblasOrder>(args.layout),
- args.m, args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS reference (clblasXgerc) on the x/y/a buffers and returns the clBLAS status
+ // converted to a StatusCode. The completion event is only waited on and released when one was
+ // actually handed back: `event` starts out as a null handle, and a returned event has to be
+ // released or it leaks on every call.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   // Trailing arguments: presumably one command queue, an empty wait list and the output event
+   auto status = clblasXgerc(static_cast<clblasOrder>(args.layout),
+       args.m, args.n, args.alpha,
+       buffers.x_vec(), args.x_offset, args.x_inc,
+       buffers.y_vec(), args.y_offset, args.y_inc,
+       buffers.a_mat(), args.a_offset, args.a_ld,
+       1, &queue_plain, 0, nullptr, &event);
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for the gerc routine: calls cblasXgerc on host-side copies of the a, x and y
+ // buffers and stores the matrix back into buffers.a_mat. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   // Each host container is zero-filled first and then populated from its buffer
+   auto host_a = std::vector<T>(args.a_size, zero);
+   buffers.a_mat.Read(queue, args.a_size, host_a);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   cblasXgerc(convertToCBLAS(args.layout),
+              args.m, args.n, args.alpha,
+              host_x, args.x_offset, args.x_inc,
+              host_y, args.y_offset, args.y_inc,
+              host_a, args.a_offset, args.a_ld);
+   // Only the matrix is written back
+   buffers.a_mat.Write(queue, args.a_size, host_a);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xgeru.h b/test/routines/level2/xgeru.h
index e5f5f235..295e69e5 100644
--- a/test/routines/level2/xgeru.h
+++ b/test/routines/level2/xgeru.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -72,7 +77,7 @@ class TestXgeru {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Geru(args.layout,
@@ -86,18 +91,39 @@ class TestXgeru {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXgeru(static_cast<clblasOrder>(args.layout),
- args.m, args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS reference (clblasXgeru) on the x/y/a buffers and returns the clBLAS status
+ // converted to a StatusCode. The completion event is only waited on and released when one was
+ // actually handed back: `event` starts out as a null handle, and a returned event has to be
+ // released or it leaks on every call.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   // Trailing arguments: presumably one command queue, an empty wait list and the output event
+   auto status = clblasXgeru(static_cast<clblasOrder>(args.layout),
+       args.m, args.n, args.alpha,
+       buffers.x_vec(), args.x_offset, args.x_inc,
+       buffers.y_vec(), args.y_offset, args.y_inc,
+       buffers.a_mat(), args.a_offset, args.a_ld,
+       1, &queue_plain, 0, nullptr, &event);
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for the geru routine: calls cblasXgeru on host-side copies of the a, x and y
+ // buffers and stores the matrix back into buffers.a_mat. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   // Each host container is zero-filled first and then populated from its buffer
+   auto host_a = std::vector<T>(args.a_size, zero);
+   buffers.a_mat.Read(queue, args.a_size, host_a);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   cblasXgeru(convertToCBLAS(args.layout),
+              args.m, args.n, args.alpha,
+              host_x, args.x_offset, args.x_inc,
+              host_y, args.y_offset, args.y_inc,
+              host_a, args.a_offset, args.a_ld);
+   // Only the matrix is written back
+   buffers.a_mat.Write(queue, args.a_size, host_a);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.h
index 34e1502f..e0bdc4da 100644
--- a/test/routines/level2/xhbmv.h
+++ b/test/routines/level2/xhbmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXhbmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hbmv(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXhbmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXhbmv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.kl, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS reference (clblasXhbmv) on the a/x/y buffers and returns the clBLAS status
+ // converted to a StatusCode. The completion event is only waited on and released when one was
+ // actually handed back: `event` starts out as a null handle, and a returned event has to be
+ // released or it leaks on every call.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   // Trailing arguments: presumably one command queue, an empty wait list and the output event
+   auto status = clblasXhbmv(static_cast<clblasOrder>(args.layout),
+       static_cast<clblasUplo>(args.triangle),
+       args.n, args.kl, args.alpha,
+       buffers.a_mat(), args.a_offset, args.a_ld,
+       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+       buffers.y_vec(), args.y_offset, args.y_inc,
+       1, &queue_plain, 0, nullptr, &event);
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for the hbmv routine: calls cblasXhbmv on host-side copies of the a, x and y
+ // buffers and stores the y vector back into buffers.y_vec. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   // Each host container is zero-filled first and then populated from its buffer
+   auto host_a = std::vector<T>(args.a_size, zero);
+   buffers.a_mat.Read(queue, args.a_size, host_a);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   cblasXhbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle),
+              args.n, args.kl, args.alpha,
+              host_a, args.a_offset, args.a_ld,
+              host_x, args.x_offset, args.x_inc, args.beta,
+              host_y, args.y_offset, args.y_inc);
+   // Only the y vector is written back
+   buffers.y_vec.Write(queue, args.y_size, host_y);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xhemv.h b/test/routines/level2/xhemv.h
index 80e22157..fa242961 100644
--- a/test/routines/level2/xhemv.h
+++ b/test/routines/level2/xhemv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXhemv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hemv(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXhemv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXhemv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS reference (clblasXhemv) on the a/x/y buffers and returns the clBLAS status
+ // converted to a StatusCode. The completion event is only waited on and released when one was
+ // actually handed back: `event` starts out as a null handle, and a returned event has to be
+ // released or it leaks on every call.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   // Trailing arguments: presumably one command queue, an empty wait list and the output event
+   auto status = clblasXhemv(static_cast<clblasOrder>(args.layout),
+       static_cast<clblasUplo>(args.triangle),
+       args.n, args.alpha,
+       buffers.a_mat(), args.a_offset, args.a_ld,
+       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+       buffers.y_vec(), args.y_offset, args.y_inc,
+       1, &queue_plain, 0, nullptr, &event);
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for the hemv routine: calls cblasXhemv on host-side copies of the a, x and y
+ // buffers and stores the y vector back into buffers.y_vec. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   // Each host container is zero-filled first and then populated from its buffer
+   auto host_a = std::vector<T>(args.a_size, zero);
+   buffers.a_mat.Read(queue, args.a_size, host_a);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   cblasXhemv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle),
+              args.n, args.alpha,
+              host_a, args.a_offset, args.a_ld,
+              host_x, args.x_offset, args.x_inc, args.beta,
+              host_y, args.y_offset, args.y_inc);
+   // Only the y vector is written back
+   buffers.y_vec.Write(queue, args.y_size, host_y);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xher.h b/test/routines/level2/xher.h
index 53c4200f..7d0e8cc3 100644
--- a/test/routines/level2/xher.h
+++ b/test/routines/level2/xher.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -66,7 +71,7 @@ class TestXher {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Her(args.layout, args.triangle,
@@ -79,18 +84,37 @@ class TestXher {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXher(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXher(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ cblasXher(convertToCBLAS(args.layout),
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ a_mat_cpu, args.a_offset, args.a_ld);
+ buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xher2.h b/test/routines/level2/xher2.h
index c12ff827..445bba74 100644
--- a/test/routines/level2/xher2.h
+++ b/test/routines/level2/xher2.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXher2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Her2(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXher2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXher2(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXher2(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXher2(convertToCBLAS(args.layout),
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc,
+ a_mat_cpu, args.a_offset, args.a_ld);
+ buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xhpmv.h b/test/routines/level2/xhpmv.h
index 8fd85b62..406e564f 100644
--- a/test/routines/level2/xhpmv.h
+++ b/test/routines/level2/xhpmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXhpmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hpmv(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXhpmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXhpmv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.ap_mat(), args.ap_offset,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXhpmv(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXhpmv(convertToCBLAS(args.layout),
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ ap_mat_cpu, args.ap_offset,
+ x_vec_cpu, args.x_offset, args.x_inc, args.beta,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xhpr.h b/test/routines/level2/xhpr.h
index 03599ddc..6f56d3f3 100644
--- a/test/routines/level2/xhpr.h
+++ b/test/routines/level2/xhpr.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -66,7 +71,7 @@ class TestXhpr {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hpr(args.layout, args.triangle,
@@ -79,18 +84,37 @@ class TestXhpr {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXhpr(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.ap_mat(), args.ap_offset,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXhpr(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.ap_mat(), args.ap_offset,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ cblasXhpr(convertToCBLAS(args.layout),
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ ap_mat_cpu, args.ap_offset);
+ buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xhpr2.h b/test/routines/level2/xhpr2.h
index 68fbc76d..43889cb9 100644
--- a/test/routines/level2/xhpr2.h
+++ b/test/routines/level2/xhpr2.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXhpr2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hpr2(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXhpr2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXhpr2(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.ap_mat(), args.ap_offset,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXhpr2(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.ap_mat(), args.ap_offset,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXhpr2(convertToCBLAS(args.layout),
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc,
+ ap_mat_cpu, args.ap_offset);
+ buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.h
index 5bc17e49..9a5c5140 100644
--- a/test/routines/level2/xsbmv.h
+++ b/test/routines/level2/xsbmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXsbmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Sbmv(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXsbmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsbmv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.kl, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXsbmv(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.kl, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXsbmv(convertToCBLAS(args.layout),
+ convertToCBLAS(args.triangle),
+ args.n, args.kl, args.alpha,
+ a_mat_cpu, args.a_offset, args.a_ld,
+ x_vec_cpu, args.x_offset, args.x_inc, args.beta,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.h
index e335da42..913af0cd 100644
--- a/test/routines/level2/xspmv.h
+++ b/test/routines/level2/xspmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXspmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Spmv(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXspmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXspmv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.ap_mat(), args.ap_offset,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXspmv(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXspmv(convertToCBLAS(args.layout),
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ ap_mat_cpu, args.ap_offset,
+ x_vec_cpu, args.x_offset, args.x_inc, args.beta,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xspr.h b/test/routines/level2/xspr.h
index 819b1ca8..bab5c541 100644
--- a/test/routines/level2/xspr.h
+++ b/test/routines/level2/xspr.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -66,7 +71,7 @@ class TestXspr {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Spr(args.layout, args.triangle,
@@ -79,18 +84,37 @@ class TestXspr {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXspr(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.ap_mat(), args.ap_offset,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXspr(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.ap_mat(), args.ap_offset,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ cblasXspr(convertToCBLAS(args.layout),
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ ap_mat_cpu, args.ap_offset);
+ buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xspr2.h b/test/routines/level2/xspr2.h
index 43d66c9e..41a04cc0 100644
--- a/test/routines/level2/xspr2.h
+++ b/test/routines/level2/xspr2.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXspr2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Spr2(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXspr2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXspr2(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.ap_mat(), args.ap_offset,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXspr2(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.ap_mat(), args.ap_offset,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXspr2(convertToCBLAS(args.layout),
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc,
+ ap_mat_cpu, args.ap_offset);
+ buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xsymv.h b/test/routines/level2/xsymv.h
index 13473a3e..0576bc1f 100644
--- a/test/routines/level2/xsymv.h
+++ b/test/routines/level2/xsymv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXsymv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Symv(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXsymv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsymv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXsymv(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXsymv(convertToCBLAS(args.layout),
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ a_mat_cpu, args.a_offset, args.a_ld,
+ x_vec_cpu, args.x_offset, args.x_inc, args.beta,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xsyr.h b/test/routines/level2/xsyr.h
index 66b75c0c..062eea5a 100644
--- a/test/routines/level2/xsyr.h
+++ b/test/routines/level2/xsyr.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -66,7 +71,7 @@ class TestXsyr {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syr(args.layout, args.triangle,
@@ -79,18 +84,37 @@ class TestXsyr {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsyr(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference; only compiled when CLBLAST_REF_CLBLAS is defined
+ auto queue_plain = queue(); // plain handle obtained from the Queue wrapper, passed by address below
+ auto event = cl_event{}; // NOTE(review): never released in this function — confirm whether clReleaseEvent is needed
+ auto status = clblasXsyr(static_cast<clblasOrder>(args.layout), // CLBlast enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ 1, &queue_plain, 0, nullptr, &event); // presumably: one queue, empty wait list, output event — confirm against wrapper_clblas.h
+ clWaitForEvents(1, &event); // block on the event before returning; the status is not checked first
+ return static_cast<StatusCode>(status); // the clBLAS status value is cast to a CLBlast StatusCode
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison); only compiled when CLBLAST_REF_CBLAS is defined
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host-side staging vectors for A and x
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fetch the current buffer contents into the staging vectors
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ cblasXsyr(convertToCBLAS(args.layout), // CLBlast enums are translated through convertToCBLAS
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ a_mat_cpu, args.a_offset, args.a_ld);
+ buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); // store the host-side A matrix back into the A buffer
+ return StatusCode::kSuccess; // no result of cblasXsyr is inspected; success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xsyr2.h b/test/routines/level2/xsyr2.h
index 32497a61..50bc3cea 100644
--- a/test/routines/level2/xsyr2.h
+++ b/test/routines/level2/xsyr2.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXsyr2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syr2(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXsyr2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsyr2(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference; only compiled when CLBLAST_REF_CLBLAS is defined
+ auto queue_plain = queue(); // plain handle obtained from the Queue wrapper, passed by address below
+ auto event = cl_event{}; // NOTE(review): never released in this function — confirm whether clReleaseEvent is needed
+ auto status = clblasXsyr2(static_cast<clblasOrder>(args.layout), // CLBlast enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ 1, &queue_plain, 0, nullptr, &event); // presumably: one queue, empty wait list, output event — confirm against wrapper_clblas.h
+ clWaitForEvents(1, &event); // block on the event before returning; the status is not checked first
+ return static_cast<StatusCode>(status); // the clBLAS status value is cast to a CLBlast StatusCode
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison); only compiled when CLBLAST_REF_CBLAS is defined
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host-side staging vectors for A, x and y
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fetch the current buffer contents into the staging vectors
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXsyr2(convertToCBLAS(args.layout), // CLBlast enums are translated through convertToCBLAS
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc,
+ a_mat_cpu, args.a_offset, args.a_ld);
+ buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); // store the host-side A matrix back into the A buffer
+ return StatusCode::kSuccess; // no result of cblasXsyr2 is inspected; success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h
index dbdddb65..600b4131 100644
--- a/test/routines/level2/xtbmv.h
+++ b/test/routines/level2/xtbmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -65,7 +70,7 @@ class TestXtbmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
@@ -78,20 +83,41 @@ class TestXtbmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- static_cast<clblasDiag>(args.diagonal),
- args.n, args.kl,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference; only compiled when CLBLAST_REF_CLBLAS is defined
+ auto queue_plain = queue(); // plain handle obtained from the Queue wrapper, passed by address below
+ auto event = cl_event{}; // NOTE(review): never released in this function — confirm whether clReleaseEvent is needed
+ auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout), // CLBlast enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasDiag>(args.diagonal),
+ args.n, args.kl, // args.kl is the only band-width argument passed for this routine
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ 1, &queue_plain, 0, nullptr, &event); // presumably: one queue, empty wait list, output event — confirm against wrapper_clblas.h
+ clWaitForEvents(1, &event); // block on the event before returning; the status is not checked first
+ return static_cast<StatusCode>(status); // the clBLAS status value is cast to a CLBlast StatusCode
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison); only compiled when CLBLAST_REF_CBLAS is defined
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host-side staging vectors for A and x
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fetch the current buffer contents into the staging vectors
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ cblasXtbmv(convertToCBLAS(args.layout), // CLBlast enums are translated through convertToCBLAS
+ convertToCBLAS(args.triangle),
+ convertToCBLAS(args.a_transpose),
+ convertToCBLAS(args.diagonal),
+ args.n, args.kl, // args.kl is the only band-width argument passed for this routine
+ a_mat_cpu, args.a_offset, args.a_ld,
+ x_vec_cpu, args.x_offset, args.x_inc);
+ buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); // x is the result here: store the host-side x vector back into the x buffer
+ return StatusCode::kSuccess; // no result of cblasXtbmv is inspected; success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h
index 4425765e..fc0cf393 100644
--- a/test/routines/level2/xtpmv.h
+++ b/test/routines/level2/xtpmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -65,7 +70,7 @@ class TestXtpmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
@@ -78,20 +83,41 @@ class TestXtpmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- static_cast<clblasDiag>(args.diagonal),
- args.n,
- buffers.ap_mat(), args.ap_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference; only compiled when CLBLAST_REF_CLBLAS is defined
+ auto queue_plain = queue(); // plain handle obtained from the Queue wrapper, passed by address below
+ auto event = cl_event{}; // NOTE(review): never released in this function — confirm whether clReleaseEvent is needed
+ auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout), // CLBlast enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasDiag>(args.diagonal),
+ args.n,
+ buffers.ap_mat(), args.ap_offset, // the ap_mat buffer takes an offset only, no leading dimension
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ 1, &queue_plain, 0, nullptr, &event); // presumably: one queue, empty wait list, output event — confirm against wrapper_clblas.h
+ clWaitForEvents(1, &event); // block on the event before returning; the status is not checked first
+ return static_cast<StatusCode>(status); // the clBLAS status value is cast to a CLBlast StatusCode
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison); only compiled when CLBLAST_REF_CBLAS is defined
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); // zero-filled host-side staging vectors for AP and x
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); // fetch the current buffer contents into the staging vectors
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ cblasXtpmv(convertToCBLAS(args.layout), // CLBlast enums are translated through convertToCBLAS
+ convertToCBLAS(args.triangle),
+ convertToCBLAS(args.a_transpose),
+ convertToCBLAS(args.diagonal),
+ args.n,
+ ap_mat_cpu, args.ap_offset, // the AP matrix takes an offset only, no leading dimension
+ x_vec_cpu, args.x_offset, args.x_inc);
+ buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); // x is the result here: store the host-side x vector back into the x buffer
+ return StatusCode::kSuccess; // no result of cblasXtpmv is inspected; success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h
index 1c0c6fd8..fec72124 100644
--- a/test/routines/level2/xtrmv.h
+++ b/test/routines/level2/xtrmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -65,7 +70,7 @@ class TestXtrmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
@@ -78,20 +83,41 @@ class TestXtrmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- static_cast<clblasDiag>(args.diagonal),
- args.n,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference; only compiled when CLBLAST_REF_CLBLAS is defined
+ auto queue_plain = queue(); // plain handle obtained from the Queue wrapper, passed by address below
+ auto event = cl_event{}; // NOTE(review): never released in this function — confirm whether clReleaseEvent is needed
+ auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout), // CLBlast enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasDiag>(args.diagonal),
+ args.n,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ 1, &queue_plain, 0, nullptr, &event); // presumably: one queue, empty wait list, output event — confirm against wrapper_clblas.h
+ clWaitForEvents(1, &event); // block on the event before returning; the status is not checked first
+ return static_cast<StatusCode>(status); // the clBLAS status value is cast to a CLBlast StatusCode
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison); only compiled when CLBLAST_REF_CBLAS is defined
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host-side staging vectors for A and x
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fetch the current buffer contents into the staging vectors
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ cblasXtrmv(convertToCBLAS(args.layout), // CLBlast enums are translated through convertToCBLAS
+ convertToCBLAS(args.triangle),
+ convertToCBLAS(args.a_transpose),
+ convertToCBLAS(args.diagonal),
+ args.n,
+ a_mat_cpu, args.a_offset, args.a_ld,
+ x_vec_cpu, args.x_offset, args.x_inc);
+ buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); // x is the result here: store the host-side x vector back into the x buffer
+ return StatusCode::kSuccess; // no result of cblasXtrmv is inspected; success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h
index 695b58b7..49a92936 100644
--- a/test/routines/level3/xgemm.h
+++ b/test/routines/level3/xgemm.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -78,7 +83,7 @@ class TestXgemm {
static Transposes GetBTransposes(const Transposes &all) { return all; }
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
@@ -92,20 +97,43 @@ class TestXgemm {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
- static_cast<clblasTranspose>(args.a_transpose),
- static_cast<clblasTranspose>(args.b_transpose),
- args.m, args.n, args.k, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference; only compiled when CLBLAST_REF_CLBLAS is defined
+ auto queue_plain = queue(); // plain handle obtained from the Queue wrapper, passed by address below
+ auto event = cl_event{}; // NOTE(review): never released in this function — confirm whether clReleaseEvent is needed
+ auto status = clblasXgemm(static_cast<clblasOrder>(args.layout), // CLBlast enums are cast directly to the clBLAS enum types
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasTranspose>(args.b_transpose),
+ args.m, args.n, args.k, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event); // presumably: one queue, empty wait list, output event — confirm against wrapper_clblas.h
+ clWaitForEvents(1, &event); // block on the event before returning; the status is not checked first
+ return static_cast<StatusCode>(status); // the clBLAS status value is cast to a CLBlast StatusCode
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison); only compiled when CLBLAST_REF_CBLAS is defined
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host-side staging vectors for A, B and C
+ std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+ std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fetch the current buffer contents into the staging vectors
+ buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+ buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); // C is read as well because it is written back below
+ cblasXgemm(convertToCBLAS(args.layout), // CLBlast enums are translated through convertToCBLAS
+ convertToCBLAS(args.a_transpose),
+ convertToCBLAS(args.b_transpose),
+ args.m, args.n, args.k, args.alpha,
+ a_mat_cpu, args.a_offset, args.a_ld,
+ b_mat_cpu, args.b_offset, args.b_ld, args.beta,
+ c_mat_cpu, args.c_offset, args.c_ld);
+ buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); // store the host-side C matrix back into the C buffer
+ return StatusCode::kSuccess; // no result of cblasXgemm is inspected; success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.h
index 7b7134e5..40538417 100644
--- a/test/routines/level3/xhemm.h
+++ b/test/routines/level3/xhemm.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -78,7 +83,7 @@ class TestXhemm {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hemm(args.layout, args.side, args.triangle,
@@ -92,20 +97,43 @@ class TestXhemm {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXhemm(static_cast<clblasOrder>(args.layout),
- static_cast<clblasSide>(args.side),
- static_cast<clblasUplo>(args.triangle),
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference; only compiled when CLBLAST_REF_CLBLAS is defined
+ auto queue_plain = queue(); // plain handle obtained from the Queue wrapper, passed by address below
+ auto event = cl_event{}; // NOTE(review): never released in this function — confirm whether clReleaseEvent is needed
+ auto status = clblasXhemm(static_cast<clblasOrder>(args.layout), // CLBlast enums are cast directly to the clBLAS enum types
+ static_cast<clblasSide>(args.side),
+ static_cast<clblasUplo>(args.triangle),
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event); // presumably: one queue, empty wait list, output event — confirm against wrapper_clblas.h
+ clWaitForEvents(1, &event); // block on the event before returning; the status is not checked first
+ return static_cast<StatusCode>(status); // the clBLAS status value is cast to a CLBlast StatusCode
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison); only compiled when CLBLAST_REF_CBLAS is defined
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host-side staging vectors for A, B and C
+ std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+ std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fetch the current buffer contents into the staging vectors
+ buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+ buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); // C is read as well because it is written back below
+ cblasXhemm(convertToCBLAS(args.layout), // CLBlast enums are translated through convertToCBLAS
+ convertToCBLAS(args.side),
+ convertToCBLAS(args.triangle),
+ args.m, args.n, args.alpha,
+ a_mat_cpu, args.a_offset, args.a_ld,
+ b_mat_cpu, args.b_offset, args.b_ld, args.beta,
+ c_mat_cpu, args.c_offset, args.c_ld);
+ buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); // store the host-side C matrix back into the C buffer
+ return StatusCode::kSuccess; // no result of cblasXhemm is inspected; success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.h
index a7fbfcbe..1ea2ad36 100644
--- a/test/routines/level3/xher2k.h
+++ b/test/routines/level3/xher2k.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -76,7 +81,7 @@ class TestXher2k {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto alpha2 = T{args.alpha, args.alpha};
@@ -91,21 +96,45 @@ class TestXher2k {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto alpha2 = T{args.alpha, args.alpha};
- auto status = clblasXher2k(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- args.n, args.k, alpha2,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference; arguments are typed on U, buffers on T
+ auto queue_plain = queue(); // plain handle obtained from the Queue wrapper, passed by address below
+ auto event = cl_event{}; // NOTE(review): never released in this function — confirm whether clReleaseEvent is needed
+ auto alpha2 = T{args.alpha, args.alpha}; // builds a T from the U-typed alpha, using it for both components (presumably real and imaginary — confirm)
+ auto status = clblasXher2k(static_cast<clblasOrder>(args.layout), // CLBlast enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ args.n, args.k, alpha2, // alpha goes in as the T-typed alpha2, while beta below stays U-typed
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event); // presumably: one queue, empty wait list, output event — confirm against wrapper_clblas.h
+ clWaitForEvents(1, &event); // block on the event before returning; the status is not checked first
+ return static_cast<StatusCode>(status); // the clBLAS status value is cast to a CLBlast StatusCode
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison); only compiled when CLBLAST_REF_CBLAS is defined
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host-side staging vectors for A, B and C
+ std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+ std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fetch the current buffer contents into the staging vectors
+ buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+ buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); // C is read as well because it is written back below
+ auto alpha2 = T{args.alpha, args.alpha}; // builds a T from the U-typed alpha, using it for both components (presumably real and imaginary — confirm)
+ cblasXher2k(convertToCBLAS(args.layout), // CLBlast enums are translated through convertToCBLAS
+ convertToCBLAS(args.triangle),
+ convertToCBLAS(args.a_transpose),
+ args.n, args.k, alpha2, // alpha goes in as the T-typed alpha2, while beta below stays U-typed
+ a_mat_cpu, args.a_offset, args.a_ld,
+ b_mat_cpu, args.b_offset, args.b_ld, args.beta,
+ c_mat_cpu, args.c_offset, args.c_ld);
+ buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); // store the host-side C matrix back into the C buffer
+ return StatusCode::kSuccess; // no result of cblasXher2k is inspected; success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.h
index f097672f..75a7c405 100644
--- a/test/routines/level3/xherk.h
+++ b/test/routines/level3/xherk.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -69,7 +74,7 @@ class TestXherk {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Herk(args.layout, args.triangle, args.a_transpose,
@@ -82,19 +87,39 @@ class TestXherk {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXherk(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- args.n, args.k, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference; arguments are typed on U, buffers on T
+ auto queue_plain = queue(); // plain handle obtained from the Queue wrapper, passed by address below
+ auto event = cl_event{}; // NOTE(review): never released in this function — confirm whether clReleaseEvent is needed
+ auto status = clblasXherk(static_cast<clblasOrder>(args.layout), // CLBlast enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ args.n, args.k, args.alpha, // alpha and beta are both passed as U-typed values
+ buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event); // presumably: one queue, empty wait list, output event — confirm against wrapper_clblas.h
+ clWaitForEvents(1, &event); // block on the event before returning; the status is not checked first
+ return static_cast<StatusCode>(status); // the clBLAS status value is cast to a CLBlast StatusCode
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison); only compiled when CLBLAST_REF_CBLAS is defined
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host-side staging vectors for A and C
+ std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fetch the current buffer contents into the staging vectors
+ buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); // C is read as well because it is written back below
+ cblasXherk(convertToCBLAS(args.layout), // CLBlast enums are translated through convertToCBLAS
+ convertToCBLAS(args.triangle),
+ convertToCBLAS(args.a_transpose),
+ args.n, args.k, args.alpha, // alpha and beta are both passed as U-typed values
+ a_mat_cpu, args.a_offset, args.a_ld, args.beta,
+ c_mat_cpu, args.c_offset, args.c_ld);
+ buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); // store the host-side C matrix back into the C buffer
+ return StatusCode::kSuccess; // no result of cblasXherk is inspected; success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.h
index 03cf5de9..f867c238 100644
--- a/test/routines/level3/xsymm.h
+++ b/test/routines/level3/xsymm.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -78,7 +83,7 @@ class TestXsymm {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Symm(args.layout, args.side, args.triangle,
@@ -92,20 +97,43 @@ class TestXsymm {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
- static_cast<clblasSide>(args.side),
- static_cast<clblasUplo>(args.triangle),
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference; only compiled when CLBLAST_REF_CLBLAS is defined
+ auto queue_plain = queue(); // plain handle obtained from the Queue wrapper, passed by address below
+ auto event = cl_event{}; // NOTE(review): never released in this function — confirm whether clReleaseEvent is needed
+ auto status = clblasXsymm(static_cast<clblasOrder>(args.layout), // CLBlast enums are cast directly to the clBLAS enum types
+ static_cast<clblasSide>(args.side),
+ static_cast<clblasUplo>(args.triangle),
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event); // presumably: one queue, empty wait list, output event — confirm against wrapper_clblas.h
+ clWaitForEvents(1, &event); // block on the event before returning; the status is not checked first
+ return static_cast<StatusCode>(status); // the clBLAS status value is cast to a CLBlast StatusCode
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison); only compiled when CLBLAST_REF_CBLAS is defined
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host-side staging vectors for A, B and C
+ std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+ std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fetch the current buffer contents into the staging vectors
+ buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+ buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); // C is read as well because it is written back below
+ cblasXsymm(convertToCBLAS(args.layout), // CLBlast enums are translated through convertToCBLAS
+ convertToCBLAS(args.side),
+ convertToCBLAS(args.triangle),
+ args.m, args.n, args.alpha,
+ a_mat_cpu, args.a_offset, args.a_ld,
+ b_mat_cpu, args.b_offset, args.b_ld, args.beta,
+ c_mat_cpu, args.c_offset, args.c_ld);
+ buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); // store the host-side C matrix back into the C buffer
+ return StatusCode::kSuccess; // no result of cblasXsymm is inspected; success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.h
index 89e77f83..be4e1851 100644
--- a/test/routines/level3/xsyr2k.h
+++ b/test/routines/level3/xsyr2k.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -76,7 +81,7 @@ class TestXsyr2k {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
@@ -90,20 +95,43 @@ class TestXsyr2k {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- args.n, args.k, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+    #ifdef CLBLAST_REF_CLBLAS
+      static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+        auto queue_plain = queue();  // underlying queue object, handed to clBLAS by address below
+        auto event = cl_event{};     // filled in by clBLAS; waited on and released on success
+        auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
+                                   static_cast<clblasUplo>(args.triangle),
+                                   static_cast<clblasTranspose>(args.a_transpose),
+                                   args.n, args.k, args.alpha,
+                                   buffers.a_mat(), args.a_offset, args.a_ld,
+                                   buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                                   buffers.c_mat(), args.c_offset, args.c_ld,
+                                   1, &queue_plain, 0, nullptr, &event);
+        if (status == clblasSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+        return static_cast<StatusCode>(status);  // clBLAS status is reported as a CLBlast StatusCode
+      }
+    #endif
+
+    // Runs the CPU BLAS SYR2K reference on host copies of A, B and C and writes C back afterwards
+    #ifdef CLBLAST_REF_CBLAS
+      static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+        auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
+        buffers.a_mat.Read(queue, args.a_size, host_a);
+        auto host_b = std::vector<T>(args.b_size, static_cast<T>(0));
+        buffers.b_mat.Read(queue, args.b_size, host_b);
+        auto host_c = std::vector<T>(args.c_size, static_cast<T>(0));
+        buffers.c_mat.Read(queue, args.c_size, host_c);
+        cblasXsyr2k(convertToCBLAS(args.layout),
+                    convertToCBLAS(args.triangle),
+                    convertToCBLAS(args.a_transpose),
+                    args.n, args.k, args.alpha,
+                    host_a, args.a_offset, args.a_ld,
+                    host_b, args.b_offset, args.b_ld, args.beta,
+                    host_c, args.c_offset, args.c_ld);
+        buffers.c_mat.Write(queue, args.c_size, host_c);  // the result is returned through c_mat
+        return StatusCode::kSuccess;
+      }
+    #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.h
index 8dacb5b3..7675e2aa 100644
--- a/test/routines/level3/xsyrk.h
+++ b/test/routines/level3/xsyrk.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -69,7 +74,7 @@ class TestXsyrk {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syrk(args.layout, args.triangle, args.a_transpose,
@@ -82,19 +87,39 @@ class TestXsyrk {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- args.n, args.k, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+    #ifdef CLBLAST_REF_CLBLAS
+      static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+        auto queue_plain = queue();  // underlying queue object, handed to clBLAS by address below
+        auto event = cl_event{};     // filled in by clBLAS; waited on and released on success
+        auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
+                                  static_cast<clblasUplo>(args.triangle),
+                                  static_cast<clblasTranspose>(args.a_transpose),
+                                  args.n, args.k, args.alpha,
+                                  buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+                                  buffers.c_mat(), args.c_offset, args.c_ld,
+                                  1, &queue_plain, 0, nullptr, &event);
+        if (status == clblasSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+        return static_cast<StatusCode>(status);  // clBLAS status is reported as a CLBlast StatusCode
+      }
+    #endif
+
+    // Runs the CPU BLAS SYRK reference on host copies of A and C and writes C back afterwards
+    #ifdef CLBLAST_REF_CBLAS
+      static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+        auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
+        buffers.a_mat.Read(queue, args.a_size, host_a);
+        auto host_c = std::vector<T>(args.c_size, static_cast<T>(0));
+        buffers.c_mat.Read(queue, args.c_size, host_c);
+        cblasXsyrk(convertToCBLAS(args.layout),
+                   convertToCBLAS(args.triangle),
+                   convertToCBLAS(args.a_transpose),
+                   args.n, args.k, args.alpha,
+                   host_a, args.a_offset, args.a_ld, args.beta,
+                   host_c, args.c_offset, args.c_ld);
+        buffers.c_mat.Write(queue, args.c_size, host_c);  // the result is returned through c_mat
+        return StatusCode::kSuccess;
+      }
+    #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.h
index 152cdf58..a085cb15 100644
--- a/test/routines/level3/xtrmm.h
+++ b/test/routines/level3/xtrmm.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -69,7 +74,7 @@ class TestXtrmm {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
@@ -82,21 +87,43 @@ class TestXtrmm {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXtrmm(static_cast<clblasOrder>(args.layout),
- static_cast<clblasSide>(args.side),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- static_cast<clblasDiag>(args.diagonal),
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+    #ifdef CLBLAST_REF_CLBLAS
+      static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+        auto queue_plain = queue();  // underlying queue object, handed to clBLAS by address below
+        auto event = cl_event{};     // filled in by clBLAS; waited on and released on success
+        auto status = clblasXtrmm(static_cast<clblasOrder>(args.layout),
+                                  static_cast<clblasSide>(args.side),
+                                  static_cast<clblasUplo>(args.triangle),
+                                  static_cast<clblasTranspose>(args.a_transpose),
+                                  static_cast<clblasDiag>(args.diagonal),
+                                  args.m, args.n, args.alpha,
+                                  buffers.a_mat(), args.a_offset, args.a_ld,
+                                  buffers.b_mat(), args.b_offset, args.b_ld,
+                                  1, &queue_plain, 0, nullptr, &event);
+        if (status == clblasSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+        return static_cast<StatusCode>(status);  // clBLAS status is reported as a CLBlast StatusCode
+      }
+    #endif
+
+    // Runs the CPU BLAS TRMM reference on host copies of A and B and writes B back afterwards
+    #ifdef CLBLAST_REF_CBLAS
+      static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+        auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
+        buffers.a_mat.Read(queue, args.a_size, host_a);
+        auto host_b = std::vector<T>(args.b_size, static_cast<T>(0));
+        buffers.b_mat.Read(queue, args.b_size, host_b);
+        cblasXtrmm(convertToCBLAS(args.layout),
+                   convertToCBLAS(args.side),
+                   convertToCBLAS(args.triangle),
+                   convertToCBLAS(args.a_transpose),
+                   convertToCBLAS(args.diagonal),
+                   args.m, args.n, args.alpha,
+                   host_a, args.a_offset, args.a_ld,
+                   host_b, args.b_offset, args.b_ld);
+        buffers.b_mat.Write(queue, args.b_size, host_b);  // the result is returned through b_mat
+        return StatusCode::kSuccess;
+      }
+    #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h
new file mode 100644
index 00000000..dec272b0
--- /dev/null
+++ b/test/wrapper_cblas.h
@@ -0,0 +1,1674 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a wrapper around a CPU BLAS library, such that its routines can be called
+// in a similar way as the CLBlast routines: using alpha and beta to determine the precision.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_WRAPPER_CBLAS_H_
+#define CLBLAST_TEST_WRAPPER_CBLAS_H_
+
+#include <cblas.h>
+
+#include "internal/utilities.h"
+
+namespace clblast {
+
+// Conversions from CLBlast enums to CBLAS enums (inline: defined in a header, so must not clash)
+inline CBLAS_ORDER convertToCBLAS(const Layout v) { return (v == Layout::kRowMajor) ? CblasRowMajor : CblasColMajor; }
+inline CBLAS_TRANSPOSE convertToCBLAS(const Transpose v) { return (v == Transpose::kNo) ? CblasNoTrans : (v == Transpose::kYes) ? CblasTrans : CblasConjTrans; }
+inline CBLAS_UPLO convertToCBLAS(const Triangle v) { return (v == Triangle::kUpper) ? CblasUpper : CblasLower; }
+inline CBLAS_DIAG convertToCBLAS(const Diagonal v) { return (v == Diagonal::kUnit) ? CblasUnit : CblasNonUnit; }
+inline CBLAS_SIDE convertToCBLAS(const Side v) { return (v == Side::kLeft) ? CblasLeft : CblasRight; }
+
+// OpenBLAS is not fully Netlib CBLAS compatible: choose the result-pointer type per library
+#ifdef OPENBLAS_VERSION  // NOTE(review): presumably defined by OpenBLAS's own cblas.h - confirm
+ using return_pointer_float = openblas_complex_float*;  // cast target in cblasXdotu/cblasXdotc (float2)
+ using return_pointer_double = openblas_complex_double*;  // cast target in cblasXdotu/cblasXdotc (double2)
+#else
+ using return_pointer_float = void*;  // used when OPENBLAS_VERSION is not defined
+ using return_pointer_double = void*;  // used when OPENBLAS_VERSION is not defined
+#endif
+
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+// =================================================================================================
+
+// Forwards to Netlib SROTG/DROTG, passing the address of the element at each buffer's offset
+inline void cblasXrotg(std::vector<float>& sa_buffer, const size_t sa_offset,
+    std::vector<float>& sb_buffer, const size_t sb_offset,
+    std::vector<float>& sc_buffer, const size_t sc_offset,
+    std::vector<float>& ss_buffer, const size_t ss_offset) {
+  cblas_srotg(&sa_buffer[sa_offset],
+              &sb_buffer[sb_offset],
+              &sc_buffer[sc_offset],
+              &ss_buffer[ss_offset]);
+}
+inline void cblasXrotg(std::vector<double>& sa_buffer, const size_t sa_offset,
+    std::vector<double>& sb_buffer, const size_t sb_offset,
+    std::vector<double>& sc_buffer, const size_t sc_offset,
+    std::vector<double>& ss_buffer, const size_t ss_offset) {
+  cblas_drotg(&sa_buffer[sa_offset],
+              &sb_buffer[sb_offset],
+              &sc_buffer[sc_offset],
+              &ss_buffer[ss_offset]);
+}
+
+// Forwards to Netlib SROTMG/DROTMG, passing the address of the element at each buffer's offset
+inline void cblasXrotmg(std::vector<float>& sd1_buffer, const size_t sd1_offset,
+    std::vector<float>& sd2_buffer, const size_t sd2_offset,
+    std::vector<float>& sx1_buffer, const size_t sx1_offset,
+    const std::vector<float>& sy1_buffer, const size_t sy1_offset,
+    std::vector<float>& sparam_buffer, const size_t sparam_offset) {
+  cblas_srotmg(&sd1_buffer[sd1_offset],
+               &sd2_buffer[sd2_offset],
+               &sx1_buffer[sx1_offset],
+               sy1_buffer[sy1_offset],  // y1 is read-only here and passed by value
+               &sparam_buffer[sparam_offset]);
+}
+inline void cblasXrotmg(std::vector<double>& sd1_buffer, const size_t sd1_offset,
+    std::vector<double>& sd2_buffer, const size_t sd2_offset,
+    std::vector<double>& sx1_buffer, const size_t sx1_offset,
+    const std::vector<double>& sy1_buffer, const size_t sy1_offset,
+    std::vector<double>& sparam_buffer, const size_t sparam_offset) {
+  cblas_drotmg(&sd1_buffer[sd1_offset],
+               &sd2_buffer[sd2_offset],
+               &sx1_buffer[sx1_offset],
+               sy1_buffer[sy1_offset],  // y1 is read-only here and passed by value
+               &sparam_buffer[sparam_offset]);
+}
+
+// Forwards to Netlib SROT/DROT on x and y, each starting at its offset with the given increment
+inline void cblasXrot(const size_t n,
+    std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+    const float cos,
+    const float sin) {
+  cblas_srot(n,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &y_buffer[y_offset], static_cast<int>(y_inc),
+             cos,
+             sin);
+}
+inline void cblasXrot(const size_t n,
+    std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+    const double cos,
+    const double sin) {
+  cblas_drot(n,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &y_buffer[y_offset], static_cast<int>(y_inc),
+             cos,
+             sin);
+}
+
+// Forwards to Netlib SROTM/DROTM on x and y, with the parameters taken at sparam_offset
+inline void cblasXrotm(const size_t n,
+    std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+    std::vector<float>& sparam_buffer, const size_t sparam_offset) {
+  cblas_srotm(n,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc),
+              &sparam_buffer[sparam_offset]);
+}
+inline void cblasXrotm(const size_t n,
+    std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+    std::vector<double>& sparam_buffer, const size_t sparam_offset) {
+  cblas_drotm(n,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc),
+              &sparam_buffer[sparam_offset]);
+}
+
+// Forwards to Netlib SSWAP/DSWAP/CSWAP/ZSWAP; complex buffers are passed as scalar pointers
+inline void cblasXswap(const size_t n,
+    std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_sswap(n,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+inline void cblasXswap(const size_t n,
+    std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dswap(n,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+inline void cblasXswap(const size_t n,
+    std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_cswap(n,
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+inline void cblasXswap(const size_t n,
+    std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_zswap(n,
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards to Netlib SSCAL/DSCAL/CSCAL/ZSCAL; complex alpha is passed as a {real, imag} array
+inline void cblasXscal(const size_t n,
+    const float alpha,
+    std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_sscal(n,
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+inline void cblasXscal(const size_t n,
+    const double alpha,
+    std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dscal(n,
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+inline void cblasXscal(const size_t n,
+    const float2 alpha,
+    std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};  // on the stack: no allocation
+  cblas_cscal(n,
+              alpha_array,
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+inline void cblasXscal(const size_t n,
+    const double2 alpha,
+    std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};  // on the stack: no allocation
+  cblas_zscal(n,
+              alpha_array,
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards to Netlib SCOPY/DCOPY/CCOPY/ZCOPY; x is the read-only argument, y the writable one
+inline void cblasXcopy(const size_t n,
+    const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_scopy(n,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+inline void cblasXcopy(const size_t n,
+    const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dcopy(n,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+inline void cblasXcopy(const size_t n,
+    const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_ccopy(n,
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+inline void cblasXcopy(const size_t n,
+    const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_zcopy(n,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards to Netlib SAXPY/DAXPY/CAXPY/ZAXPY; complex alpha is passed as a {real, imag} array
+inline void cblasXaxpy(const size_t n,
+    const float alpha,
+    const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_saxpy(n,
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+inline void cblasXaxpy(const size_t n,
+    const double alpha,
+    const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_daxpy(n,
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+inline void cblasXaxpy(const size_t n,
+    const float2 alpha,
+    const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};  // on the stack: no allocation
+  cblas_caxpy(n,
+              alpha_array,
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+inline void cblasXaxpy(const size_t n,
+    const double2 alpha,
+    const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};  // on the stack: no allocation
+  cblas_zaxpy(n,
+              alpha_array,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards to Netlib SDOT/DDOT and stores the returned value at dot_buffer[dot_offset]
+inline void cblasXdot(const size_t n,
+    std::vector<float>& dot_buffer, const size_t dot_offset,
+    const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  dot_buffer[dot_offset] = cblas_sdot(n,
+                                      &x_buffer[x_offset], static_cast<int>(x_inc),
+                                      &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+inline void cblasXdot(const size_t n,
+    std::vector<double>& dot_buffer, const size_t dot_offset,
+    const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  dot_buffer[dot_offset] = cblas_ddot(n,
+                                      &x_buffer[x_offset], static_cast<int>(x_inc),
+                                      &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+
+// Forwards to Netlib CDOTU/ZDOTU (the _sub variants), which receive &dot_buffer[dot_offset]
+inline void cblasXdotu(const size_t n,
+    std::vector<float2>& dot_buffer, const size_t dot_offset,
+    const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_cdotu_sub(n,
+                  reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+                  reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+                  reinterpret_cast<return_pointer_float>(&dot_buffer[dot_offset]));
+}
+inline void cblasXdotu(const size_t n,
+    std::vector<double2>& dot_buffer, const size_t dot_offset,
+    const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_zdotu_sub(n,
+                  reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+                  reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+                  reinterpret_cast<return_pointer_double>(&dot_buffer[dot_offset]));
+}
+
+// Forwards to Netlib CDOTC/ZDOTC (the _sub variants), which receive &dot_buffer[dot_offset]
+inline void cblasXdotc(const size_t n,
+    std::vector<float2>& dot_buffer, const size_t dot_offset,
+    const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_cdotc_sub(n,
+                  reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+                  reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+                  reinterpret_cast<return_pointer_float>(&dot_buffer[dot_offset]));
+}
+inline void cblasXdotc(const size_t n,
+    std::vector<double2>& dot_buffer, const size_t dot_offset,
+    const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_zdotc_sub(n,
+                  reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+                  reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+                  reinterpret_cast<return_pointer_double>(&dot_buffer[dot_offset]));
+}
+
+// Forwards to Netlib SNRM2/DNRM2/ScNRM2/DzNRM2 and stores the result at nrm2_buffer[nrm2_offset]
+inline void cblasXnrm2(const size_t n,
+    std::vector<float>& nrm2_buffer, const size_t nrm2_offset,
+    const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  nrm2_buffer[nrm2_offset] = cblas_snrm2(n,
+                                         &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+inline void cblasXnrm2(const size_t n,
+    std::vector<double>& nrm2_buffer, const size_t nrm2_offset,
+    const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  nrm2_buffer[nrm2_offset] = cblas_dnrm2(n,
+                                         &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+inline void cblasXnrm2(const size_t n,
+    std::vector<float2>& nrm2_buffer, const size_t nrm2_offset,
+    const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  nrm2_buffer[nrm2_offset] = cblas_scnrm2(n,  // the returned value is assigned to a float2 element
+      reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+inline void cblasXnrm2(const size_t n,
+    std::vector<double2>& nrm2_buffer, const size_t nrm2_offset,
+    const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  nrm2_buffer[nrm2_offset] = cblas_dznrm2(n,  // the returned value is assigned to a double2 element
+      reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+// =================================================================================================
+
+// Forwards to Netlib SGEMV/DGEMV/CGEMV/ZGEMV; complex alpha/beta are passed as {real, imag}
+inline void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+    const size_t m, const size_t n,
+    const float alpha,
+    const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+    const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const float beta,
+    std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_sgemv(layout, a_transpose,
+              m, n,
+              alpha,
+              &a_buffer[a_offset], a_ld,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+inline void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+    const size_t m, const size_t n,
+    const double alpha,
+    const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+    const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const double beta,
+    std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dgemv(layout, a_transpose,
+              m, n,
+              alpha,
+              &a_buffer[a_offset], a_ld,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+inline void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+    const size_t m, const size_t n,
+    const float2 alpha,
+    const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+    const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const float2 beta,
+    std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};  // on the stack: no allocation
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_cgemv(layout, a_transpose,
+              m, n,
+              alpha_array,
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld,
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+inline void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+    const size_t m, const size_t n,
+    const double2 alpha,
+    const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+    const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const double2 beta,
+    std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};  // on the stack: no allocation
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zgemv(layout, a_transpose,
+              m, n,
+              alpha_array,
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards to Netlib SGBMV/DGBMV/CGBMV/ZGBMV; complex alpha/beta are passed as {real, imag}
+inline void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+    const size_t m, const size_t n, const size_t kl, const size_t ku,
+    const float alpha,
+    const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+    const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const float beta,
+    std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_sgbmv(layout, a_transpose,
+              m, n, kl, ku,
+              alpha,
+              &a_buffer[a_offset], a_ld,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+inline void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+    const size_t m, const size_t n, const size_t kl, const size_t ku,
+    const double alpha,
+    const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+    const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const double beta,
+    std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dgbmv(layout, a_transpose,
+              m, n, kl, ku,
+              alpha,
+              &a_buffer[a_offset], a_ld,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+inline void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+    const size_t m, const size_t n, const size_t kl, const size_t ku,
+    const float2 alpha,
+    const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+    const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const float2 beta,
+    std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};  // on the stack: no allocation
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_cgbmv(layout, a_transpose,
+              m, n, kl, ku,
+              alpha_array,
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), a_ld,
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+inline void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+    const size_t m, const size_t n, const size_t kl, const size_t ku,
+    const double2 alpha,
+    const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+    const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+    const double2 beta,
+    std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};  // on the stack: no allocation
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zgbmv(layout, a_transpose,
+              m, n, kl, ku,
+              alpha_array,
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for CHEMV/ZHEMV
+// Single-precision complex variant; forwards to cblas_chemv starting at the given element offsets.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float2 beta,
+                std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  // Scalars are passed as {real, imag} pairs; fixed-size stack arrays avoid a heap allocation.
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_chemv(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+// Double-precision complex variant; forwards to cblas_zhemv starting at the given element offsets.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double2 beta,
+                std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  // Scalars are passed as {real, imag} pairs; fixed-size stack arrays avoid a heap allocation.
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zhemv(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for CHBMV/ZHBMV
+// Single-precision complex variant; forwards to cblas_chbmv starting at the given element offsets.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n, const size_t k,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float2 beta,
+                std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  // Scalars are passed as {real, imag} pairs; fixed-size stack arrays avoid a heap allocation.
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_chbmv(layout, triangle,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha_array,
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+// Double-precision complex variant; forwards to cblas_zhbmv starting at the given element offsets.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n, const size_t k,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double2 beta,
+                std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  // Scalars are passed as {real, imag} pairs; fixed-size stack arrays avoid a heap allocation.
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zhbmv(layout, triangle,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha_array,
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for CHPMV/ZHPMV
+// Single-precision complex variant; forwards to cblas_chpmv starting at the given element offsets.
+// The size n is narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& ap_buffer, const size_t ap_offset,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float2 beta,
+                std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  // Scalars are passed as {real, imag} pairs; fixed-size stack arrays avoid a heap allocation.
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_chpmv(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&ap_buffer[ap_offset]),
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+// Double-precision complex variant; forwards to cblas_zhpmv starting at the given element offsets.
+// The size n is narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& ap_buffer, const size_t ap_offset,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double2 beta,
+                std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  // Scalars are passed as {real, imag} pairs; fixed-size stack arrays avoid a heap allocation.
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zhpmv(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&ap_buffer[ap_offset]),
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for SSYMV/DSYMV
+// Single-precision variant; forwards to cblas_ssymv starting at the given element offsets.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float beta,
+                std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_ssymv(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+// Double-precision variant; forwards to cblas_dsymv starting at the given element offsets.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double beta,
+                std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dsymv(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for SSBMV/DSBMV
+// Single-precision variant; forwards to cblas_ssbmv starting at the given element offsets.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n, const size_t k,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float beta,
+                std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_ssbmv(layout, triangle,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+// Double-precision variant; forwards to cblas_dsbmv starting at the given element offsets.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n, const size_t k,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double beta,
+                std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dsbmv(layout, triangle,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for SSPMV/DSPMV
+// Single-precision variant; forwards to cblas_sspmv starting at the given element offsets.
+// The size n is narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float alpha,
+                const std::vector<float>& ap_buffer, const size_t ap_offset,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float beta,
+                std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_sspmv(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &ap_buffer[ap_offset],
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+// Double-precision variant; forwards to cblas_dspmv starting at the given element offsets.
+// The size n is narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double alpha,
+                const std::vector<double>& ap_buffer, const size_t ap_offset,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double beta,
+                std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dspmv(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &ap_buffer[ap_offset],
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for STRMV/DTRMV/CTRMV/ZTRMV
+// Single-precision variant; forwards to cblas_strmv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_strmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Double-precision variant; forwards to cblas_dtrmv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dtrmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Single-precision complex variant; forwards to cblas_ctrmv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ctrmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+// Double-precision complex variant; forwards to cblas_ztrmv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ztrmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards the Netlib BLAS calls for STBMV/DTBMV/CTBMV/ZTBMV
+// Single-precision variant; forwards to cblas_stbmv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_stbmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Double-precision variant; forwards to cblas_dtbmv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dtbmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Single-precision complex variant; forwards to cblas_ctbmv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ctbmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+// Double-precision complex variant; forwards to cblas_ztbmv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ztbmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards the Netlib BLAS calls for STPMV/DTPMV/CTPMV/ZTPMV
+// Single-precision variant; forwards to cblas_stpmv, which updates x_buffer in place.
+// The size n is narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float>& ap_buffer, const size_t ap_offset,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_stpmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &ap_buffer[ap_offset],
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Double-precision variant; forwards to cblas_dtpmv, which updates x_buffer in place.
+// The size n is narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double>& ap_buffer, const size_t ap_offset,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dtpmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &ap_buffer[ap_offset],
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Single-precision complex variant; forwards to cblas_ctpmv, which updates x_buffer in place.
+// The size n is narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float2>& ap_buffer, const size_t ap_offset,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ctpmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const float*>(&ap_buffer[ap_offset]),
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+// Double-precision complex variant; forwards to cblas_ztpmv, which updates x_buffer in place.
+// The size n is narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double2>& ap_buffer, const size_t ap_offset,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ztpmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const double*>(&ap_buffer[ap_offset]),
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards the Netlib BLAS calls for STRSV/DTRSV/CTRSV/ZTRSV
+// Single-precision variant; forwards to cblas_strsv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_strsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Double-precision variant; forwards to cblas_dtrsv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dtrsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Single-precision complex variant; forwards to cblas_ctrsv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ctrsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+// Double-precision complex variant; forwards to cblas_ztrsv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ztrsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards the Netlib BLAS calls for STBSV/DTBSV/CTBSV/ZTBSV
+// Single-precision variant; forwards to cblas_stbsv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_stbsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Double-precision variant; forwards to cblas_dtbsv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dtbsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Single-precision complex variant; forwards to cblas_ctbsv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ctbsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+// Double-precision complex variant; forwards to cblas_ztbsv, which updates x_buffer in place.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ztbsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards the Netlib BLAS calls for STPSV/DTPSV/CTPSV/ZTPSV
+// Single-precision variant; forwards to cblas_stpsv, which updates x_buffer in place.
+// The size n is narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float>& ap_buffer, const size_t ap_offset,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_stpsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &ap_buffer[ap_offset],
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Double-precision variant; forwards to cblas_dtpsv, which updates x_buffer in place.
+// The size n is narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double>& ap_buffer, const size_t ap_offset,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dtpsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &ap_buffer[ap_offset],
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Single-precision complex variant; forwards to cblas_ctpsv, which updates x_buffer in place.
+// The size n is narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float2>& ap_buffer, const size_t ap_offset,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ctpsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const float*>(&ap_buffer[ap_offset]),
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+// Double-precision complex variant; forwards to cblas_ztpsv, which updates x_buffer in place.
+// The size n is narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double2>& ap_buffer, const size_t ap_offset,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ztpsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const double*>(&ap_buffer[ap_offset]),
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards the Netlib BLAS calls for SGER/DGER
+// Single-precision variant; forwards to cblas_sger, which writes its result into a_buffer.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXger(const CBLAS_ORDER layout,
+               const size_t m, const size_t n,
+               const float alpha,
+               const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+               const std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+               std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_sger(layout,
+             static_cast<int>(m), static_cast<int>(n),
+             alpha,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &y_buffer[y_offset], static_cast<int>(y_inc),
+             &a_buffer[a_offset], static_cast<int>(a_ld));
+}
+// Double-precision variant; forwards to cblas_dger, which writes its result into a_buffer.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXger(const CBLAS_ORDER layout,
+               const size_t m, const size_t n,
+               const double alpha,
+               const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+               const std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+               std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_dger(layout,
+             static_cast<int>(m), static_cast<int>(n),
+             alpha,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &y_buffer[y_offset], static_cast<int>(y_inc),
+             &a_buffer[a_offset], static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for CGERU/ZGERU
+// Single-precision complex variant; forwards to cblas_cgeru, which writes its result into a_buffer.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXgeru(const CBLAS_ORDER layout,
+                const size_t m, const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  // The scalar is passed as a {real, imag} pair; a stack array avoids a heap allocation.
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_cgeru(layout,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<float*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+// Double-precision complex variant; forwards to cblas_zgeru, which writes its result into a_buffer.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXgeru(const CBLAS_ORDER layout,
+                const size_t m, const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  // The scalar is passed as a {real, imag} pair; a stack array avoids a heap allocation.
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_zgeru(layout,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<double*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for CGERC/ZGERC
+// Single-precision complex variant; forwards to cblas_cgerc, which writes its result into a_buffer.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXgerc(const CBLAS_ORDER layout,
+                const size_t m, const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  // The scalar is passed as a {real, imag} pair; a stack array avoids a heap allocation.
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_cgerc(layout,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<float*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+// Double-precision complex variant; forwards to cblas_zgerc, which writes its result into a_buffer.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXgerc(const CBLAS_ORDER layout,
+                const size_t m, const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  // The scalar is passed as a {real, imag} pair; a stack array avoids a heap allocation.
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_zgerc(layout,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<double*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for CHER/ZHER
+// Single-precision complex variant (real-valued alpha); forwards to cblas_cher, which writes its
+// result into a_buffer. All size_t sizes are narrowed with an explicit static_cast<int>.
+void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const float alpha,
+               const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_cher(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+             reinterpret_cast<float*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+// Double-precision complex variant (real-valued alpha); forwards to cblas_zher, which writes its
+// result into a_buffer. All size_t sizes are narrowed with an explicit static_cast<int>.
+void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const double alpha,
+               const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_zher(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+             reinterpret_cast<double*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for CHPR/ZHPR
+// Single-precision complex variant (real-valued alpha); forwards to cblas_chpr, which writes its
+// result into ap_buffer. The size n is narrowed with an explicit static_cast<int>.
+void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const float alpha,
+               const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<float2>& ap_buffer, const size_t ap_offset) {
+  cblas_chpr(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+             reinterpret_cast<float*>(&ap_buffer[ap_offset]));
+}
+// Double-precision complex variant (real-valued alpha); forwards to cblas_zhpr, which writes its
+// result into ap_buffer. The size n is narrowed with an explicit static_cast<int>.
+void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const double alpha,
+               const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<double2>& ap_buffer, const size_t ap_offset) {
+  cblas_zhpr(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+             reinterpret_cast<double*>(&ap_buffer[ap_offset]));
+}
+
+// Forwards the Netlib BLAS calls for CHER2/ZHER2
+// Single-precision complex variant; forwards to cblas_cher2, which writes its result into a_buffer.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  // The scalar is passed as a {real, imag} pair; a stack array avoids a heap allocation.
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_cher2(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<float*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+// Double-precision complex variant; forwards to cblas_zher2, which writes its result into a_buffer.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  // The scalar is passed as a {real, imag} pair; a stack array avoids a heap allocation.
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_zher2(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<double*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for CHPR2/ZHPR2
+// Single-precision complex variant; forwards to cblas_chpr2, which writes its result into ap_buffer.
+// The size n is narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<float2>& ap_buffer, const size_t ap_offset) {
+  // The scalar is passed as a {real, imag} pair; a stack array avoids a heap allocation.
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_chpr2(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<float*>(&ap_buffer[ap_offset]));
+}
+// Double-precision complex variant; forwards to cblas_zhpr2, which writes its result into ap_buffer.
+// The size n is narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<double2>& ap_buffer, const size_t ap_offset) {
+  // The scalar is passed as a {real, imag} pair; a stack array avoids a heap allocation.
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_zhpr2(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<double*>(&ap_buffer[ap_offset]));
+}
+
+// Forwards the Netlib BLAS calls for SSYR/DSYR
+// Single-precision variant; forwards to cblas_ssyr, which writes its result into a_buffer.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const float alpha,
+               const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_ssyr(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &a_buffer[a_offset], static_cast<int>(a_ld));
+}
+// Double-precision variant; forwards to cblas_dsyr, which writes its result into a_buffer.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const double alpha,
+               const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_dsyr(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &a_buffer[a_offset], static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for SSPR/DSPR
+// Single-precision variant; forwards to cblas_sspr, which writes its result into ap_buffer.
+// The size n is narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const float alpha,
+               const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<float>& ap_buffer, const size_t ap_offset) {
+  cblas_sspr(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &ap_buffer[ap_offset]);
+}
+// Double-precision variant; forwards to cblas_dspr, which writes its result into ap_buffer.
+// The size n is narrowed with an explicit static_cast<int>, as the increment is.
+void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const double alpha,
+               const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<double>& ap_buffer, const size_t ap_offset) {
+  cblas_dspr(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &ap_buffer[ap_offset]);
+}
+
+// Forwards the Netlib BLAS calls for SSYR2/DSYR2
+// Single-precision variant; forwards to cblas_ssyr2, which writes its result into a_buffer.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float alpha,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_ssyr2(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc),
+              &a_buffer[a_offset], static_cast<int>(a_ld));
+}
+// Double-precision variant; forwards to cblas_dsyr2, which writes its result into a_buffer.
+// All size_t sizes are narrowed with an explicit static_cast<int>, as the increments are.
+void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double alpha,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_dsyr2(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc),
+              &a_buffer[a_offset], static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for SSPR2/DSPR2 (cblas_sspr2 / cblas_dspr2).
+// Two read-only vectors (x, y) are passed with their increments, followed by the writable
+// 'ap_buffer' pointer; no leading dimension is passed for 'ap_buffer'.
+//  - Pointers are computed with data() + offset so that an offset equal to the buffer size is
+//    not an out-of-range element access.
+//  - 'n' is converted to int explicitly, consistent with the increments
+//    (assumes the CBLAS interface takes int arguments).
+void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float alpha,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<float>& ap_buffer, const size_t ap_offset) {
+  cblas_sspr2(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              x_buffer.data() + x_offset, static_cast<int>(x_inc),
+              y_buffer.data() + y_offset, static_cast<int>(y_inc),
+              ap_buffer.data() + ap_offset);
+}
+void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double alpha,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<double>& ap_buffer, const size_t ap_offset) {
+  cblas_dspr2(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              x_buffer.data() + x_offset, static_cast<int>(x_inc),
+              y_buffer.data() + y_offset, static_cast<int>(y_inc),
+              ap_buffer.data() + ap_offset);
+}
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+// =================================================================================================
+
+// Forwards the Netlib BLAS calls for SGEMM/DGEMM/CGEMM/ZGEMM.
+// One overload per scalar type; each hands its arguments to the matching cblas_?gemm routine.
+//  - Matrix pointers are data() + offset, which is well-defined even when the offset equals the
+//    buffer size (taking &buffer[offset] in that case is an out-of-range access).
+//  - m, n, k and the leading dimensions are converted to int explicitly instead of relying on an
+//    implicit size_t narrowing (assumes the CBLAS interface takes int arguments).
+//  - The complex overloads pass alpha/beta as two-element {real, imag} stack arrays and view the
+//    float2/double2 buffers as interleaved float/double data.
+//    NOTE(review): the reinterpret_cast presumes float2/double2 are layout-compatible with two
+//    consecutive scalars (e.g. std::complex) - confirm against the type definitions.
+void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose,
+                const size_t m, const size_t n, const size_t k,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const float beta,
+                std::vector<float>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_sgemm(layout, a_transpose, b_transpose,
+              static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              a_buffer.data() + a_offset, static_cast<int>(a_ld),
+              b_buffer.data() + b_offset, static_cast<int>(b_ld),
+              beta,
+              c_buffer.data() + c_offset, static_cast<int>(c_ld));
+}
+void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose,
+                const size_t m, const size_t n, const size_t k,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const double beta,
+                std::vector<double>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_dgemm(layout, a_transpose, b_transpose,
+              static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              a_buffer.data() + a_offset, static_cast<int>(a_ld),
+              b_buffer.data() + b_offset, static_cast<int>(b_ld),
+              beta,
+              c_buffer.data() + c_offset, static_cast<int>(c_ld));
+}
+void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose,
+                const size_t m, const size_t n, const size_t k,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const float2 beta,
+                std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Complex scalars are handed to CBLAS by pointer; a stack array avoids a heap allocation per call
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_cgemm(layout, a_transpose, b_transpose,
+              static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
+              alpha_array,
+              reinterpret_cast<const float*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              reinterpret_cast<const float*>(b_buffer.data() + b_offset), static_cast<int>(b_ld),
+              beta_array,
+              reinterpret_cast<float*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose,
+                const size_t m, const size_t n, const size_t k,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const double2 beta,
+                std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Complex scalars are handed to CBLAS by pointer; a stack array avoids a heap allocation per call
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zgemm(layout, a_transpose, b_transpose,
+              static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
+              alpha_array,
+              reinterpret_cast<const double*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              reinterpret_cast<const double*>(b_buffer.data() + b_offset), static_cast<int>(b_ld),
+              beta_array,
+              reinterpret_cast<double*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+
+// Forwards the Netlib BLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM.
+// Each overload passes layout/side/triangle, the sizes m and n, the scalars and the three matrix
+// pointers (a, b read-only; c writable) to the cblas_?symm routine of the same precision.
+//  - data() + offset replaces &buffer[offset] so that an offset equal to the buffer size is not
+//    an out-of-range element access.
+//  - Sizes and leading dimensions are narrowed to int with an explicit static_cast
+//    (assumes the CBLAS interface takes int arguments).
+//  - Complex alpha/beta are marshalled as {real, imag} stack arrays.
+//    NOTE(review): viewing float2/double2 storage as float/double presumes an interleaved
+//    real/imag layout - confirm against the type definitions.
+void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
+                const size_t m, const size_t n,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const float beta,
+                std::vector<float>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_ssymm(layout, side, triangle,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha,
+              a_buffer.data() + a_offset, static_cast<int>(a_ld),
+              b_buffer.data() + b_offset, static_cast<int>(b_ld),
+              beta,
+              c_buffer.data() + c_offset, static_cast<int>(c_ld));
+}
+void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
+                const size_t m, const size_t n,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const double beta,
+                std::vector<double>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_dsymm(layout, side, triangle,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha,
+              a_buffer.data() + a_offset, static_cast<int>(a_ld),
+              b_buffer.data() + b_offset, static_cast<int>(b_ld),
+              beta,
+              c_buffer.data() + c_offset, static_cast<int>(c_ld));
+}
+void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
+                const size_t m, const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const float2 beta,
+                std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Two-element stack arrays hold the complex scalars; no heap allocation is needed for them
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_csymm(layout, side, triangle,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              reinterpret_cast<const float*>(b_buffer.data() + b_offset), static_cast<int>(b_ld),
+              beta_array,
+              reinterpret_cast<float*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
+                const size_t m, const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const double2 beta,
+                std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Two-element stack arrays hold the complex scalars; no heap allocation is needed for them
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zsymm(layout, side, triangle,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              reinterpret_cast<const double*>(b_buffer.data() + b_offset), static_cast<int>(b_ld),
+              beta_array,
+              reinterpret_cast<double*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+
+// Forwards the Netlib BLAS calls for CHEMM/ZHEMM (cblas_chemm / cblas_zhemm).
+// Only complex overloads exist; both scalars (alpha, beta) are complex and are passed to CBLAS by
+// pointer as {real, imag} pairs held in stack arrays.
+//  - Buffer pointers are data() + offset, which remains valid when the offset equals the buffer
+//    size, whereas &buffer[offset] would be an out-of-range access.
+//  - m, n and the leading dimensions are converted to int explicitly
+//    (assumes the CBLAS interface takes int arguments).
+//    NOTE(review): the reinterpret_cast to float*/double* presumes float2/double2 store real and
+//    imaginary parts contiguously - confirm against the type definitions.
+void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
+                const size_t m, const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const float2 beta,
+                std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_chemm(layout, side, triangle,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              reinterpret_cast<const float*>(b_buffer.data() + b_offset), static_cast<int>(b_ld),
+              beta_array,
+              reinterpret_cast<float*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
+                const size_t m, const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const double2 beta,
+                std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zhemm(layout, side, triangle,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              reinterpret_cast<const double*>(b_buffer.data() + b_offset), static_cast<int>(b_ld),
+              beta_array,
+              reinterpret_cast<double*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+
+// Forwards the Netlib BLAS calls for SSYRK/DSYRK/CSYRK/ZSYRK.
+// Each overload passes one read-only matrix (a) and one writable matrix (c) plus alpha and beta
+// to the cblas_?syrk routine of the matching precision.
+//  - Pointers come from data() + offset; this is defined for an offset equal to the buffer size,
+//    which &buffer[offset] is not.
+//  - n, k and the leading dimensions get an explicit int conversion
+//    (assumes the CBLAS interface takes int arguments).
+//  - In the complex overloads alpha and beta are both complex and are passed by pointer as
+//    {real, imag} stack arrays.
+//    NOTE(review): the float2/double2 to float/double pointer casts presume interleaved storage -
+//    confirm against the type definitions.
+void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
+                const size_t n, const size_t k,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const float beta,
+                std::vector<float>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_ssyrk(layout, triangle, a_transpose,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              a_buffer.data() + a_offset, static_cast<int>(a_ld),
+              beta,
+              c_buffer.data() + c_offset, static_cast<int>(c_ld));
+}
+void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
+                const size_t n, const size_t k,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const double beta,
+                std::vector<double>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_dsyrk(layout, triangle, a_transpose,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              a_buffer.data() + a_offset, static_cast<int>(a_ld),
+              beta,
+              c_buffer.data() + c_offset, static_cast<int>(c_ld));
+}
+void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
+                const size_t n, const size_t k,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const float2 beta,
+                std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_csyrk(layout, triangle, a_transpose,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha_array,
+              reinterpret_cast<const float*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              beta_array,
+              reinterpret_cast<float*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
+                const size_t n, const size_t k,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const double2 beta,
+                std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zsyrk(layout, triangle, a_transpose,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha_array,
+              reinterpret_cast<const double*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              beta_array,
+              reinterpret_cast<double*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+
+// Forwards the Netlib BLAS calls for CHERK/ZHERK (cblas_cherk / cblas_zherk).
+// The matrices are complex (float2/double2) while alpha and beta are real and are therefore
+// passed to CBLAS by value, not through a pointer.
+//  - data() + offset is used for the matrix pointers so that an offset equal to the buffer size
+//    does not perform an out-of-range element access.
+//  - n, k and the leading dimensions are converted to int explicitly
+//    (assumes the CBLAS interface takes int arguments).
+//    NOTE(review): the casts to float*/double* presume float2/double2 are stored as interleaved
+//    real/imag scalars - confirm against the type definitions.
+void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
+                const size_t n, const size_t k,
+                const float alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const float beta,
+                std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_cherk(layout, triangle, a_transpose,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              reinterpret_cast<const float*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              beta,
+              reinterpret_cast<float*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
+                const size_t n, const size_t k,
+                const double alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const double beta,
+                std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_zherk(layout, triangle, a_transpose,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              reinterpret_cast<const double*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              beta,
+              reinterpret_cast<double*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+
+// Forwards the Netlib BLAS calls for SSYR2K/DSYR2K/CSYR2K/ZSYR2K.
+// A single transpose flag ('ab_transpose') is passed for both read-only inputs a and b; c is the
+// writable matrix argument. Each overload calls the cblas_?syr2k routine of its precision.
+//  - Pointers are data() + offset: valid for an offset equal to the buffer size, where
+//    &buffer[offset] would index out of range.
+//  - n, k and all leading dimensions are converted to int explicitly
+//    (assumes the CBLAS interface takes int arguments).
+//  - Complex alpha/beta are handed over by pointer as {real, imag} stack arrays.
+//    NOTE(review): the float2/double2 pointer casts presume interleaved real/imag storage -
+//    confirm against the type definitions.
+void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
+                 const size_t n, const size_t k,
+                 const float alpha,
+                 const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                 const std::vector<float>& b_buffer, const size_t b_offset, const size_t b_ld,
+                 const float beta,
+                 std::vector<float>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_ssyr2k(layout, triangle, ab_transpose,
+               static_cast<int>(n), static_cast<int>(k),
+               alpha,
+               a_buffer.data() + a_offset, static_cast<int>(a_ld),
+               b_buffer.data() + b_offset, static_cast<int>(b_ld),
+               beta,
+               c_buffer.data() + c_offset, static_cast<int>(c_ld));
+}
+void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
+                 const size_t n, const size_t k,
+                 const double alpha,
+                 const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                 const std::vector<double>& b_buffer, const size_t b_offset, const size_t b_ld,
+                 const double beta,
+                 std::vector<double>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_dsyr2k(layout, triangle, ab_transpose,
+               static_cast<int>(n), static_cast<int>(k),
+               alpha,
+               a_buffer.data() + a_offset, static_cast<int>(a_ld),
+               b_buffer.data() + b_offset, static_cast<int>(b_ld),
+               beta,
+               c_buffer.data() + c_offset, static_cast<int>(c_ld));
+}
+void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
+                 const size_t n, const size_t k,
+                 const float2 alpha,
+                 const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                 const std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                 const float2 beta,
+                 std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_csyr2k(layout, triangle, ab_transpose,
+               static_cast<int>(n), static_cast<int>(k),
+               alpha_array,
+               reinterpret_cast<const float*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+               reinterpret_cast<const float*>(b_buffer.data() + b_offset), static_cast<int>(b_ld),
+               beta_array,
+               reinterpret_cast<float*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
+                 const size_t n, const size_t k,
+                 const double2 alpha,
+                 const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                 const std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                 const double2 beta,
+                 std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zsyr2k(layout, triangle, ab_transpose,
+               static_cast<int>(n), static_cast<int>(k),
+               alpha_array,
+               reinterpret_cast<const double*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+               reinterpret_cast<const double*>(b_buffer.data() + b_offset), static_cast<int>(b_ld),
+               beta_array,
+               reinterpret_cast<double*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+
+// Forwards the Netlib BLAS calls for CHER2K/ZHER2K (cblas_cher2k / cblas_zher2k).
+// 'alpha' is complex and is passed by pointer as a {real, imag} stack array; 'beta' is real and
+// is passed by value. a and b are read-only inputs, c is the writable matrix argument.
+//  - Pointers are data() + offset, which is defined when the offset equals the buffer size;
+//    &buffer[offset] would be an out-of-range access in that case.
+//  - n, k and the leading dimensions are converted to int explicitly
+//    (assumes the CBLAS interface takes int arguments).
+//    NOTE(review): casting float2/double2 storage to float*/double* presumes interleaved
+//    real/imag scalars - confirm against the type definitions.
+void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
+                 const size_t n, const size_t k,
+                 const float2 alpha,
+                 const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                 const std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                 const float beta,
+                 std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_cher2k(layout, triangle, ab_transpose,
+               static_cast<int>(n), static_cast<int>(k),
+               alpha_array,
+               reinterpret_cast<const float*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+               reinterpret_cast<const float*>(b_buffer.data() + b_offset), static_cast<int>(b_ld),
+               beta,
+               reinterpret_cast<float*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
+                 const size_t n, const size_t k,
+                 const double2 alpha,
+                 const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                 const std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                 const double beta,
+                 std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_zher2k(layout, triangle, ab_transpose,
+               static_cast<int>(n), static_cast<int>(k),
+               alpha_array,
+               reinterpret_cast<const double*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+               reinterpret_cast<const double*>(b_buffer.data() + b_offset), static_cast<int>(b_ld),
+               beta,
+               reinterpret_cast<double*>(c_buffer.data() + c_offset), static_cast<int>(c_ld));
+}
+
+// Forwards the Netlib BLAS calls for STRMM/DTRMM/CTRMM/ZTRMM.
+// 'a_buffer' is passed read-only; 'b_buffer' is passed as the writable matrix argument. There is
+// no beta and no separate output buffer in this interface.
+//  - Pointers are data() + offset so that an offset equal to the buffer size is not an
+//    out-of-range element access.
+//  - m, n and the leading dimensions are converted to int explicitly
+//    (assumes the CBLAS interface takes int arguments).
+//  - The complex overloads pass alpha by pointer as a {real, imag} stack array.
+//    NOTE(review): the float2/double2 pointer casts presume interleaved real/imag storage -
+//    confirm against the type definitions.
+void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  cblas_strmm(layout, side, triangle, a_transpose, diagonal,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha,
+              a_buffer.data() + a_offset, static_cast<int>(a_ld),
+              b_buffer.data() + b_offset, static_cast<int>(b_ld));
+}
+void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  cblas_dtrmm(layout, side, triangle, a_transpose, diagonal,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha,
+              a_buffer.data() + a_offset, static_cast<int>(a_ld),
+              b_buffer.data() + b_offset, static_cast<int>(b_ld));
+}
+void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_ctrmm(layout, side, triangle, a_transpose, diagonal,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              reinterpret_cast<float*>(b_buffer.data() + b_offset), static_cast<int>(b_ld));
+}
+void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_ztrmm(layout, side, triangle, a_transpose, diagonal,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              reinterpret_cast<double*>(b_buffer.data() + b_offset), static_cast<int>(b_ld));
+}
+
+// Forwards the Netlib BLAS calls for STRSM/DTRSM/CTRSM/ZTRSM.
+// The argument list mirrors the TRMM wrappers: 'a_buffer' is passed read-only and 'b_buffer' is
+// passed as the writable matrix argument of the cblas_?trsm routine.
+//  - Pointers are data() + offset; unlike &buffer[offset] this is well-defined when the offset
+//    equals the buffer size.
+//  - m, n and the leading dimensions are converted to int explicitly
+//    (assumes the CBLAS interface takes int arguments).
+//  - The complex overloads pass alpha by pointer as a {real, imag} stack array.
+//    NOTE(review): the float2/double2 pointer casts presume interleaved real/imag storage -
+//    confirm against the type definitions.
+void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  cblas_strsm(layout, side, triangle, a_transpose, diagonal,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha,
+              a_buffer.data() + a_offset, static_cast<int>(a_ld),
+              b_buffer.data() + b_offset, static_cast<int>(b_ld));
+}
+void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  cblas_dtrsm(layout, side, triangle, a_transpose, diagonal,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha,
+              a_buffer.data() + a_offset, static_cast<int>(a_ld),
+              b_buffer.data() + b_offset, static_cast<int>(b_ld));
+}
+void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_ctrsm(layout, side, triangle, a_transpose, diagonal,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              reinterpret_cast<float*>(b_buffer.data() + b_offset), static_cast<int>(b_ld));
+}
+void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_ztrsm(layout, side, triangle, a_transpose, diagonal,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(a_buffer.data() + a_offset), static_cast<int>(a_ld),
+              reinterpret_cast<double*>(b_buffer.data() + b_offset), static_cast<int>(b_ld));
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_WRAPPER_CBLAS_H_
+#endif
diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h
index fb6e83aa..89b708b8 100644
--- a/test/wrapper_clblas.h
+++ b/test/wrapper_clblas.h
@@ -65,7 +65,7 @@ template <typename T>
clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
- cl_mem sy1_buffer, const size_t sy1_offset,
+ const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
@@ -73,7 +73,7 @@ template <>
clblasStatus clblasXrotmg<float>(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
- cl_mem sy1_buffer, const size_t sy1_offset,
+ const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
@@ -88,7 +88,7 @@ template <>
clblasStatus clblasXrotmg<double>(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
- cl_mem sy1_buffer, const size_t sy1_offset,
+ const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {