summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt10
-rw-r--r--include/internal/clpp11.h14
-rw-r--r--include/internal/utilities.h5
-rw-r--r--scripts/generator/generator.py2
-rw-r--r--test/correctness/testblas.cc45
-rw-r--r--test/correctness/testblas.h47
-rw-r--r--test/correctness/tester.cc16
-rw-r--r--test/correctness/tester.h6
-rw-r--r--test/performance/client.cc40
-rw-r--r--test/performance/client.h33
-rw-r--r--test/routines/level1/xaxpy.h46
-rw-r--r--test/routines/level1/xcopy.h46
-rw-r--r--test/routines/level1/xdot.h51
-rw-r--r--test/routines/level1/xdotc.h51
-rw-r--r--test/routines/level1/xdotu.h51
-rw-r--r--test/routines/level1/xnrm2.h46
-rw-r--r--test/routines/level1/xscal.h41
-rw-r--r--test/routines/level1/xswap.h47
-rw-r--r--test/routines/level2/xgbmv.h57
-rw-r--r--test/routines/level2/xgemv.h57
-rw-r--r--test/routines/level2/xger.h54
-rw-r--r--test/routines/level2/xgerc.h54
-rw-r--r--test/routines/level2/xgeru.h54
-rw-r--r--test/routines/level2/xhbmv.h57
-rw-r--r--test/routines/level2/xhemv.h57
-rw-r--r--test/routines/level2/xher.h52
-rw-r--r--test/routines/level2/xher2.h57
-rw-r--r--test/routines/level2/xhpmv.h57
-rw-r--r--test/routines/level2/xhpr.h52
-rw-r--r--test/routines/level2/xhpr2.h57
-rw-r--r--test/routines/level2/xsbmv.h57
-rw-r--r--test/routines/level2/xspmv.h57
-rw-r--r--test/routines/level2/xspr.h52
-rw-r--r--test/routines/level2/xspr2.h57
-rw-r--r--test/routines/level2/xsymv.h57
-rw-r--r--test/routines/level2/xsyr.h52
-rw-r--r--test/routines/level2/xsyr2.h57
-rw-r--r--test/routines/level2/xtbmv.h58
-rw-r--r--test/routines/level2/xtpmv.h58
-rw-r--r--test/routines/level2/xtrmv.h58
-rw-r--r--test/routines/level3/xgemm.h60
-rw-r--r--test/routines/level3/xhemm.h60
-rw-r--r--test/routines/level3/xher2k.h63
-rw-r--r--test/routines/level3/xherk.h55
-rw-r--r--test/routines/level3/xsymm.h60
-rw-r--r--test/routines/level3/xsyr2k.h60
-rw-r--r--test/routines/level3/xsyrk.h55
-rw-r--r--test/routines/level3/xtrmm.h61
-rw-r--r--test/wrapper_cblas.h7
49 files changed, 1691 insertions, 615 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 48aaefe9..21254ded 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -226,10 +226,20 @@ if(TESTS)
if(CLBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES})
+ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+ add_definitions(" /DCLBLAST_REF_CLBLAS")
+ else()
+ add_definitions(" -DCLBLAST_REF_CLBLAS")
+ endif()
endif()
if(CBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
+ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+ add_definitions(" /DCLBLAST_REF_CBLAS")
+ else()
+ add_definitions(" -DCLBLAST_REF_CBLAS")
+ endif()
endif()
# Sets the include directories
diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
index aac66396..00905ef7 100644
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@@ -465,31 +465,33 @@ class Buffer {
}
// Copies from device to host: reading the device buffer a-synchronously
- void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
+ void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
- const size_t offset = 0) {
+ const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
- const size_t offset = 0) {
+ const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}
// Copies from device to host: reading the device buffer
- void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
+ void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
ReadAsync(queue, size, host, offset);
queue.Finish();
}
- void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
+ void Read(const Queue &queue, const size_t size, std::vector<T> &host,
+ const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
- void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
+ void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
+ const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
diff --git a/include/internal/utilities.h b/include/internal/utilities.h
index 35f76722..6adc1d0a 100644
--- a/include/internal/utilities.h
+++ b/include/internal/utilities.h
@@ -35,6 +35,9 @@ using double2 = std::complex<double>;
const std::string kKhronosHalfPrecision = "cl_khr_fp16";
const std::string kKhronosDoublePrecision = "cl_khr_fp64";
+// Caught an unknown error
+constexpr auto kUnknownError = -999;
+
// =================================================================================================
// The routine-specific arguments in string form
@@ -70,6 +73,7 @@ constexpr auto kArgFraction = "fraction";
// The client-specific arguments in string form
constexpr auto kArgCompareclblas = "clblas";
+constexpr auto kArgComparecblas = "cblas";
constexpr auto kArgStepSize = "step";
constexpr auto kArgNumSteps = "num_steps";
constexpr auto kArgNumRuns = "runs";
@@ -128,6 +132,7 @@ struct Arguments {
double fraction = 1.0;
// Client-specific arguments
int compare_clblas = 1;
+ int compare_cblas = 1;
size_t step = 1;
size_t num_steps = 0;
size_t num_runs = 10;
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 36a9bf40..bdf6b9d7 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -289,7 +289,7 @@ files = [
path_clblast+"/test/wrapper_clblas.h",
path_clblast+"/test/wrapper_cblas.h",
]
-header_lines = [84, 65, 93, 22, 22, 31]
+header_lines = [84, 65, 93, 22, 22, 38]
footer_lines = [6, 3, 9, 2, 6, 6]
# Checks whether the command-line arguments are valid; exists otherwise
diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc
index 1329b2c5..cc9a5adb 100644
--- a/test/correctness/testblas.cc
+++ b/test/correctness/testblas.cc
@@ -79,24 +79,6 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
// Iterates over all the to-be-tested combinations of arguments
for (auto &args: test_vector) {
- // Runs the reference clBLAS code
- auto x_vec1 = Buffer<T>(context_, args.x_size);
- auto y_vec1 = Buffer<T>(context_, args.y_size);
- auto a_mat1 = Buffer<T>(context_, args.a_size);
- auto b_mat1 = Buffer<T>(context_, args.b_size);
- auto c_mat1 = Buffer<T>(context_, args.c_size);
- auto ap_mat1 = Buffer<T>(context_, args.ap_size);
- auto scalar1 = Buffer<T>(context_, args.scalar_size);
- x_vec1.Write(queue_, args.x_size, x_source_);
- y_vec1.Write(queue_, args.y_size, y_source_);
- a_mat1.Write(queue_, args.a_size, a_source_);
- b_mat1.Write(queue_, args.b_size, b_source_);
- c_mat1.Write(queue_, args.c_size, c_source_);
- ap_mat1.Write(queue_, args.ap_size, ap_source_);
- scalar1.Write(queue_, args.scalar_size, scalar_source_);
- auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
- auto status1 = run_reference_(args, buffers1, queue_);
-
// Runs the CLBlast code
auto x_vec2 = Buffer<T>(context_, args.x_size);
auto y_vec2 = Buffer<T>(context_, args.y_size);
@@ -115,6 +97,33 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
auto status2 = run_routine_(args, buffers2, queue_);
+ #ifndef CLBLAST_REF_CLBLAS
+ // Don't continue with CBLAS if there are incorrect parameters
+ if (status2 != StatusCode::kSuccess) {
+ // TODO: Mark this as a skipped test instead of a successful test
+ TestErrorCodes(status2, status2, args);
+ continue;
+ }
+ #endif
+
+ // Runs the reference BLAS code
+ auto x_vec1 = Buffer<T>(context_, args.x_size);
+ auto y_vec1 = Buffer<T>(context_, args.y_size);
+ auto a_mat1 = Buffer<T>(context_, args.a_size);
+ auto b_mat1 = Buffer<T>(context_, args.b_size);
+ auto c_mat1 = Buffer<T>(context_, args.c_size);
+ auto ap_mat1 = Buffer<T>(context_, args.ap_size);
+ auto scalar1 = Buffer<T>(context_, args.scalar_size);
+ x_vec1.Write(queue_, args.x_size, x_source_);
+ y_vec1.Write(queue_, args.y_size, y_source_);
+ a_mat1.Write(queue_, args.a_size, a_source_);
+ b_mat1.Write(queue_, args.b_size, b_source_);
+ c_mat1.Write(queue_, args.c_size, c_source_);
+ ap_mat1.Write(queue_, args.ap_size, ap_source_);
+ scalar1.Write(queue_, args.scalar_size, scalar_source_);
+ auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
+ auto status1 = run_reference_(args, buffers1, queue_);
+
// Tests for equality of the two status codes
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
TestErrorCodes(status1, status2, args);
diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h
index 7c9032bd..8181aaf6 100644
--- a/test/correctness/testblas.h
+++ b/test/correctness/testblas.h
@@ -68,7 +68,7 @@ class TestBlas: public Tester<T,U> {
static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file
// Shorthand for the routine-specific functions passed to the tester
- using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers<T>&, Queue&)>;
+ using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers<T>&, Queue&)>;
using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>;
using ResultIterator = std::function<size_t(const Arguments<U>&)>;
@@ -76,8 +76,9 @@ class TestBlas: public Tester<T,U> {
// Constructor, initializes the base class tester and input data
TestBlas(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
- const Routine run_routine, const Routine run_reference, const ResultGet get_result,
- const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2);
+ const Routine run_routine, const Routine run_reference,
+ const ResultGet get_result, const ResultIndex get_index,
+ const ResultIterator get_id1, const ResultIterator get_id2);
// The test functions, taking no inputs
void TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name);
@@ -110,9 +111,17 @@ class TestBlas: public Tester<T,U> {
template <typename C, typename T, typename U>
void RunTests(int argc, char *argv[], const bool silent, const std::string &name) {
+ // Sets the reference to test against
+ #ifdef CLBLAST_REF_CLBLAS
+ const auto reference_routine = C::RunReference1; // clBLAS when available
+ #else
+ const auto reference_routine = C::RunReference2; // otherwise CBLAS
+ #endif
+
// Creates a tester
auto options = C::GetOptions();
- TestBlas<T,U> tester{argc, argv, silent, name, options, C::RunRoutine, C::RunReference,
+ TestBlas<T,U> tester{argc, argv, silent, name, options,
+ C::RunRoutine, reference_routine,
C::DownloadResult, C::GetResultIndex, C::ResultID1, C::ResultID2};
// This variable holds the arguments relevant for this routine
@@ -250,23 +259,25 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
}
// Creates the arguments vector for the invalid-buffer tests
- auto invalid_test_vector = std::vector<Arguments<U>>{};
- auto i_args = args;
- i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
- i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize;
- for (auto &x_size: x_sizes) { i_args.x_size = x_size;
- for (auto &y_size: y_sizes) { i_args.y_size = y_size;
- for (auto &a_size: a_sizes) { i_args.a_size = a_size;
- for (auto &b_size: b_sizes) { i_args.b_size = b_size;
- for (auto &c_size: c_sizes) { i_args.c_size = c_size;
- for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size;
- invalid_test_vector.push_back(i_args);
+ #ifdef CLBLAST_REF_CLBLAS
+ auto invalid_test_vector = std::vector<Arguments<U>>{};
+ auto i_args = args;
+ i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
+ i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize;
+ for (auto &x_size: x_sizes) { i_args.x_size = x_size;
+ for (auto &y_size: y_sizes) { i_args.y_size = y_size;
+ for (auto &a_size: a_sizes) { i_args.a_size = a_size;
+ for (auto &b_size: b_sizes) { i_args.b_size = b_size;
+ for (auto &c_size: c_sizes) { i_args.c_size = c_size;
+ for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size;
+ invalid_test_vector.push_back(i_args);
+ }
}
}
}
}
}
- }
+ #endif
// Sets the name of this test-case
auto names = std::vector<std::string>{};
@@ -287,7 +298,9 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
// Runs the tests
tester.TestRegular(regular_test_vector, case_name);
- tester.TestInvalid(invalid_test_vector, case_name);
+ #ifdef CLBLAST_REF_CLBLAS
+ tester.TestInvalid(invalid_test_vector, case_name);
+ #endif
}
}
}
diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc
index 8169f700..872a131a 100644
--- a/test/correctness/tester.cc
+++ b/test/correctness/tester.cc
@@ -69,10 +69,12 @@ Tester<T,U>::Tester(int argc, char *argv[], const bool silent,
kUnsupportedPrecision.c_str());
// Initializes clBLAS
- auto status = clblasSetup();
- if (status != CL_SUCCESS) {
- throw std::runtime_error("clBLAS setup error: "+ToString(static_cast<int>(status)));
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ auto status = clblasSetup();
+ if (status != CL_SUCCESS) {
+ throw std::runtime_error("clBLAS setup error: "+ToString(static_cast<int>(status)));
+ }
+ #endif
}
// Destructor prints the summary of the test cases and cleans-up the clBLAS library
@@ -87,7 +89,11 @@ Tester<T,U>::~Tester() {
fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
}
fprintf(stdout, "\n");
- clblasTeardown();
+
+ // Cleans-up clBLAS
+ #ifdef CLBLAST_REF_CLBLAS
+ clblasTeardown();
+ #endif
}
// =================================================================================================
diff --git a/test/correctness/tester.h b/test/correctness/tester.h
index db714f3d..d489f829 100644
--- a/test/correctness/tester.h
+++ b/test/correctness/tester.h
@@ -23,7 +23,9 @@
#include <memory>
// The libraries
-#include <clBLAS.h>
+#ifdef CLBLAST_REF_CLBLAS
+ #include <clBLAS.h>
+#endif
#include "clblast.h"
#include "internal/utilities.h"
@@ -92,7 +94,7 @@ class Tester {
Queue queue_;
// Whether or not to run the full test-suite or just a smoke test
- bool full_test_;
+ const bool full_test_;
// Retrieves the offset values to test with
const std::vector<size_t> GetOffsets() const;
diff --git a/test/performance/client.cc b/test/performance/client.cc
index 17f54231..56ab8c8d 100644
--- a/test/performance/client.cc
+++ b/test/performance/client.cc
@@ -24,11 +24,13 @@ namespace clblast {
// Constructor
template <typename T, typename U>
-Client<T,U>::Client(const Routine run_routine, const Routine run_reference,
+Client<T,U>::Client(const Routine run_routine,
+ const Routine run_reference1, const Routine run_reference2,
const std::vector<std::string> &options,
const GetMetric get_flops, const GetMetric get_bytes):
run_routine_(run_routine),
- run_reference_(run_reference),
+ run_reference1_(run_reference1),
+ run_reference2_(run_reference2),
options_(options),
get_flops_(get_flops),
get_bytes_(get_bytes) {
@@ -90,7 +92,16 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
- args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1);
+ #ifdef CLBLAST_REF_CLBLAS
+ args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1);
+ #else
+ args.compare_clblas = 0;
+ #endif
+ #ifdef CLBLAST_REF_CBLAS
+ args.compare_cblas = GetArgument(argc, argv, help, kArgComparecblas, 1);
+ #else
+ args.compare_cblas = 0;
+ #endif
args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1});
args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0});
args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10});
@@ -120,7 +131,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
auto device = Device(platform, args.device_id);
auto context = Context(device);
auto queue = Queue(context, device);
- if (args.compare_clblas) { clblasSetup(); }
+ #ifdef CLBLAST_REF_CLBLAS
+ if (args.compare_clblas) { clblasSetup(); }
+ #endif
// Iterates over all "num_step" values jumping by "step" each time
auto s = size_t{0};
@@ -167,9 +180,13 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
timings.push_back(std::pair<std::string, double>("CLBlast", ms_clblast));
if (args.compare_clblas) {
- auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
+ auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS");
timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas));
}
+ if (args.compare_cblas) {
+ auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS");
+ timings.push_back(std::pair<std::string, double>("CPU BLAS", ms_cblas));
+ }
// Prints the performance of the tested libraries
PrintTableRow(args, timings);
@@ -186,7 +203,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
}
// Cleans-up and returns
- if (args.compare_clblas) { clblasTeardown(); }
+ #ifdef CLBLAST_REF_CLBLAS
+ if (args.compare_clblas) { clblasTeardown(); }
+ #endif
}
// =================================================================================================
@@ -196,14 +215,17 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
// value found in the vector of timing results. The return value is in milliseconds.
template <typename T, typename U>
double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
- const Buffers<T> &buffers, Queue &queue,
+ Buffers<T> &buffers, Queue &queue,
Routine run_blas, const std::string &library_name) {
auto timings = std::vector<double>(num_runs);
for (auto &timing: timings) {
auto start_time = std::chrono::steady_clock::now();
// Executes the main computation
- auto status = run_blas(args, buffers, queue);
+ auto status = StatusCode::kSuccess;
+ try {
+ status = run_blas(args, buffers, queue);
+ } catch (...) { status = static_cast<StatusCode>(kUnknownError); }
if (status != StatusCode::kSuccess) {
throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
}
@@ -226,6 +248,7 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); }
fprintf(stdout, " | <-- CLBlast -->");
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
+ if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); }
fprintf(stdout, " |\n");
}
@@ -233,6 +256,7 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); }
fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
+ if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); }
fprintf(stdout, "\n");
}
diff --git a/test/performance/client.h b/test/performance/client.h
index 5805b8a5..8d0597d7 100644
--- a/test/performance/client.h
+++ b/test/performance/client.h
@@ -26,7 +26,9 @@
#include <utility>
// The libraries to test
-#include <clBLAS.h>
+#ifdef CLBLAST_REF_CLBLAS
+ #include <clBLAS.h>
+#endif
#include "clblast.h"
#include "internal/utilities.h"
@@ -40,12 +42,12 @@ class Client {
public:
// Shorthand for the routine-specific functions passed to the tester
- using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers<T>&, Queue&)>;
+ using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
using SetMetric = std::function<void(Arguments<U>&)>;
using GetMetric = std::function<size_t(const Arguments<U>&)>;
// The constructor
- Client(const Routine run_routine, const Routine run_reference,
+ Client(const Routine run_routine, const Routine run_reference1, const Routine run_reference2,
const std::vector<std::string> &options,
const GetMetric get_flops, const GetMetric get_bytes);
@@ -61,7 +63,7 @@ class Client {
private:
// Runs a function a given number of times and returns the execution time of the shortest instance
- double TimedExecution(const size_t num_runs, const Arguments<U> &args, const Buffers<T> &buffers,
+ double TimedExecution(const size_t num_runs, const Arguments<U> &args, Buffers<T> &buffers,
Queue &queue, Routine run_blas, const std::string &library_name);
// Prints the header of a performance-data table
@@ -73,7 +75,8 @@ class Client {
// The routine-specific functions passed to the tester
const Routine run_routine_;
- const Routine run_reference_;
+ const Routine run_reference1_;
+ const Routine run_reference2_;
const std::vector<std::string> options_;
const GetMetric get_flops_;
const GetMetric get_bytes_;
@@ -81,13 +84,31 @@ class Client {
// =================================================================================================
+// Bogus reference function, in case a comparison library is not available
+template <typename T, typename U>
+static StatusCode ReferenceNotAvailable(const Arguments<U> &, Buffers<T> &, Queue &) {
+ return StatusCode::kNotImplemented;
+}
+
// The interface to the performance client. This is a separate function in the header such that it
// is automatically compiled for each routine, templated by the parameter "C".
template <typename C, typename T, typename U>
void RunClient(int argc, char *argv[]) {
+ // Sets the reference to test against
+ #ifdef CLBLAST_REF_CLBLAS
+ const auto reference1 = C::RunReference1; // clBLAS when available
+ #else
+ const auto reference1 = ReferenceNotAvailable<T,U>;
+ #endif
+ #ifdef CLBLAST_REF_CBLAS
+ const auto reference2 = C::RunReference2; // CBLAS when available
+ #else
+ const auto reference2 = ReferenceNotAvailable<T,U>;
+ #endif
+
// Creates a new client
- auto client = Client<T,U>(C::RunRoutine, C::RunReference, C::GetOptions(),
+ auto client = Client<T,U>(C::RunRoutine, reference1, reference2, C::GetOptions(),
C::GetFlops, C::GetBytes);
// Simple command line argument parser with defaults
diff --git a/test/routines/level1/xaxpy.h b/test/routines/level1/xaxpy.h
index 50480f46..8f72f570 100644
--- a/test/routines/level1/xaxpy.h
+++ b/test/routines/level1/xaxpy.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -65,7 +70,7 @@ class TestXaxpy {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Axpy(args.n, args.alpha,
@@ -77,16 +82,33 @@ class TestXaxpy {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXaxpy(args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXaxpy(args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXaxpy(args.n, args.alpha,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xcopy.h b/test/routines/level1/xcopy.h
index 8d324d88..0527ca6a 100644
--- a/test/routines/level1/xcopy.h
+++ b/test/routines/level1/xcopy.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -64,7 +69,7 @@ class TestXcopy {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Copy<T>(args.n,
@@ -76,16 +81,33 @@ class TestXcopy {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXcopy<T>(args.n,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXcopy<T>(args.n,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXcopy(args.n,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xdot.h b/test/routines/level1/xdot.h
index 04669f52..d1c34c0f 100644
--- a/test/routines/level1/xdot.h
+++ b/test/routines/level1/xdot.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -68,7 +73,7 @@ class TestXdot {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Dot<T>(args.n,
@@ -81,17 +86,37 @@ class TestXdot {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXdot<T>(args.n,
- buffers.scalar(), args.dot_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXdot<T>(args.n,
+ buffers.scalar(), args.dot_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXdot(args.n,
+ scalar_cpu, args.dot_offset,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xdotc.h b/test/routines/level1/xdotc.h
index e5b42ef4..a2742cb0 100644
--- a/test/routines/level1/xdotc.h
+++ b/test/routines/level1/xdotc.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -68,7 +73,7 @@ class TestXdotc {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Dotc<T>(args.n,
@@ -81,17 +86,37 @@ class TestXdotc {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXdotc<T>(args.n,
- buffers.scalar(), args.dot_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS dotc reference on the device buffers, using the handle returned by queue().
+ // Returns the clBLAS status cast to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   auto status = clblasXdotc<T>(args.n,
+                                buffers.scalar(), args.dot_offset,
+                                buffers.x_vec(), args.x_offset, args.x_inc,
+                                buffers.y_vec(), args.y_offset, args.y_inc,
+                                1, &queue_plain, 0, nullptr, &event);
+   // Wait only if an event was handed back, and release it afterwards so it is not leaked.
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for dotc: downloads the scalar, x and y buffers into host vectors, calls the
+ // CBLAS wrapper on them and uploads the scalar result again. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   auto host_scalar = std::vector<T>(args.scalar_size, zero);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.scalar.Read(queue, args.scalar_size, host_scalar);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   cblasXdotc(args.n, host_scalar, args.dot_offset,
+              host_x, args.x_offset, args.x_inc,
+              host_y, args.y_offset, args.y_inc);
+   buffers.scalar.Write(queue, args.scalar_size, host_scalar);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xdotu.h b/test/routines/level1/xdotu.h
index 6430148c..06ce979e 100644
--- a/test/routines/level1/xdotu.h
+++ b/test/routines/level1/xdotu.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -68,7 +73,7 @@ class TestXdotu {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Dotu<T>(args.n,
@@ -81,17 +86,37 @@ class TestXdotu {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXdotu<T>(args.n,
- buffers.scalar(), args.dot_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS dotu reference on the device buffers, using the handle returned by queue().
+ // Returns the clBLAS status cast to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   auto status = clblasXdotu<T>(args.n,
+                                buffers.scalar(), args.dot_offset,
+                                buffers.x_vec(), args.x_offset, args.x_inc,
+                                buffers.y_vec(), args.y_offset, args.y_inc,
+                                1, &queue_plain, 0, nullptr, &event);
+   // Wait only if an event was handed back, and release it afterwards so it is not leaked.
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for dotu: downloads the scalar, x and y buffers into host vectors, calls the
+ // CBLAS wrapper on them and uploads the scalar result again. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   auto host_scalar = std::vector<T>(args.scalar_size, zero);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.scalar.Read(queue, args.scalar_size, host_scalar);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   cblasXdotu(args.n, host_scalar, args.dot_offset,
+              host_x, args.x_offset, args.x_inc,
+              host_y, args.y_offset, args.y_inc);
+   buffers.scalar.Write(queue, args.scalar_size, host_scalar);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xnrm2.h b/test/routines/level1/xnrm2.h
index e3f77ee4..d8a0de4e 100644
--- a/test/routines/level1/xnrm2.h
+++ b/test/routines/level1/xnrm2.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -64,7 +69,7 @@ class TestXnrm2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Nrm2<T>(args.n,
@@ -76,16 +81,33 @@ class TestXnrm2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXnrm2<T>(args.n,
- buffers.scalar(), args.nrm2_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS nrm2 reference on the device buffers, using the handle returned by queue().
+ // Returns the clBLAS status cast to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   auto status = clblasXnrm2<T>(args.n,
+                                buffers.scalar(), args.nrm2_offset,
+                                buffers.x_vec(), args.x_offset, args.x_inc,
+                                1, &queue_plain, 0, nullptr, &event);
+   // Wait only if an event was handed back, and release it afterwards so it is not leaked.
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for nrm2: downloads the scalar and x buffers into host vectors, calls the
+ // CBLAS wrapper on them and uploads the scalar result again. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   auto host_scalar = std::vector<T>(args.scalar_size, zero);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   buffers.scalar.Read(queue, args.scalar_size, host_scalar);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   cblasXnrm2(args.n, host_scalar, args.nrm2_offset,
+              host_x, args.x_offset, args.x_inc);
+   buffers.scalar.Write(queue, args.scalar_size, host_scalar);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xscal.h b/test/routines/level1/xscal.h
index d990afcc..35855dbd 100644
--- a/test/routines/level1/xscal.h
+++ b/test/routines/level1/xscal.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -61,7 +66,7 @@ class TestXscal {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Scal(args.n, args.alpha,
@@ -72,15 +77,29 @@ class TestXscal {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXscal(args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS scal reference on the device x buffer, using the handle returned by queue().
+ // Returns the clBLAS status cast to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   auto status = clblasXscal(args.n, args.alpha,
+                             buffers.x_vec(), args.x_offset, args.x_inc,
+                             1, &queue_plain, 0, nullptr, &event);
+   // Wait only if an event was handed back, and release it afterwards so it is not leaked.
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for scal: downloads the x buffer into a host vector, calls the CBLAS wrapper
+ // on it and uploads the modified vector again. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   cblasXscal(args.n, args.alpha, host_x, args.x_offset, args.x_inc);
+   buffers.x_vec.Write(queue, args.x_size, host_x);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level1/xswap.h b/test/routines/level1/xswap.h
index 2096a2c3..ae69d3be 100644
--- a/test/routines/level1/xswap.h
+++ b/test/routines/level1/xswap.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -64,7 +69,7 @@ class TestXswap {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Swap<T>(args.n,
@@ -76,16 +81,34 @@ class TestXswap {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXswap<T>(args.n,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS swap reference on the device buffers, using the handle returned by queue().
+ // Returns the clBLAS status cast to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   auto status = clblasXswap<T>(args.n,
+                                buffers.x_vec(), args.x_offset, args.x_inc,
+                                buffers.y_vec(), args.y_offset, args.y_inc,
+                                1, &queue_plain, 0, nullptr, &event);
+   // Wait only if an event was handed back, and release it afterwards so it is not leaked.
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for swap: downloads the x and y buffers into host vectors, calls the CBLAS
+ // wrapper on them and uploads both vectors again. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   cblasXswap(args.n, host_x, args.x_offset, args.x_inc,
+              host_y, args.y_offset, args.y_inc);
+   buffers.x_vec.Write(queue, args.x_size, host_x);
+   buffers.y_vec.Write(queue, args.y_size, host_y);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h
index 0e238804..b875075d 100644
--- a/test/routines/level2/xgbmv.h
+++ b/test/routines/level2/xgbmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -76,7 +81,7 @@ class TestXgbmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gbmv(args.layout, args.a_transpose,
@@ -90,19 +95,41 @@ class TestXgbmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasTranspose>(args.a_transpose),
- args.m, args.n, args.kl, args.ku, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS gbmv reference on the device buffers, using the handle returned by queue().
+ // Layout and transpose are cast to their clBLAS enum types. Returns the clBLAS status cast to a
+ // StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout),
+                             static_cast<clblasTranspose>(args.a_transpose),
+                             args.m, args.n, args.kl, args.ku, args.alpha,
+                             buffers.a_mat(), args.a_offset, args.a_ld,
+                             buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                             buffers.y_vec(), args.y_offset, args.y_inc,
+                             1, &queue_plain, 0, nullptr, &event);
+   // Wait only if an event was handed back, and release it afterwards so it is not leaked.
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for gbmv: downloads the a, x and y buffers into host vectors, calls the CBLAS
+ // wrapper on them and uploads the y vector again. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   auto host_a = std::vector<T>(args.a_size, zero);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.a_mat.Read(queue, args.a_size, host_a);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   const auto layout = convertToCBLAS(args.layout);
+   const auto a_transpose = convertToCBLAS(args.a_transpose);
+   cblasXgbmv(layout, a_transpose,
+              args.m, args.n, args.kl, args.ku, args.alpha,
+              host_a, args.a_offset, args.a_ld,
+              host_x, args.x_offset, args.x_inc, args.beta,
+              host_y, args.y_offset, args.y_inc);
+   buffers.y_vec.Write(queue, args.y_size, host_y);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.h
index 2924d498..a70ccd34 100644
--- a/test/routines/level2/xgemv.h
+++ b/test/routines/level2/xgemv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -76,7 +81,7 @@ class TestXgemv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gemv(args.layout, args.a_transpose,
@@ -90,19 +95,41 @@ class TestXgemv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasTranspose>(args.a_transpose),
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS gemv reference on the device buffers, using the handle returned by queue().
+ // Layout and transpose are cast to their clBLAS enum types. Returns the clBLAS status cast to a
+ // StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
+                             static_cast<clblasTranspose>(args.a_transpose),
+                             args.m, args.n, args.alpha,
+                             buffers.a_mat(), args.a_offset, args.a_ld,
+                             buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                             buffers.y_vec(), args.y_offset, args.y_inc,
+                             1, &queue_plain, 0, nullptr, &event);
+   // Wait only if an event was handed back, and release it afterwards so it is not leaked.
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for gemv: downloads the a, x and y buffers into host vectors, calls the CBLAS
+ // wrapper on them and uploads the y vector again. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   auto host_a = std::vector<T>(args.a_size, zero);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.a_mat.Read(queue, args.a_size, host_a);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   const auto layout = convertToCBLAS(args.layout);
+   const auto a_transpose = convertToCBLAS(args.a_transpose);
+   cblasXgemv(layout, a_transpose,
+              args.m, args.n, args.alpha,
+              host_a, args.a_offset, args.a_ld,
+              host_x, args.x_offset, args.x_inc, args.beta,
+              host_y, args.y_offset, args.y_inc);
+   buffers.y_vec.Write(queue, args.y_size, host_y);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xger.h b/test/routines/level2/xger.h
index 98296e92..32c2a505 100644
--- a/test/routines/level2/xger.h
+++ b/test/routines/level2/xger.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -72,7 +77,7 @@ class TestXger {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Ger(args.layout,
@@ -86,18 +91,39 @@ class TestXger {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXger(static_cast<clblasOrder>(args.layout),
- args.m, args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS ger reference on the device buffers, using the handle returned by queue().
+ // The layout is cast to its clBLAS enum type. Returns the clBLAS status cast to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   auto status = clblasXger(static_cast<clblasOrder>(args.layout),
+                            args.m, args.n, args.alpha,
+                            buffers.x_vec(), args.x_offset, args.x_inc,
+                            buffers.y_vec(), args.y_offset, args.y_inc,
+                            buffers.a_mat(), args.a_offset, args.a_ld,
+                            1, &queue_plain, 0, nullptr, &event);
+   // Wait only if an event was handed back, and release it afterwards so it is not leaked.
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for ger: downloads the a, x and y buffers into host vectors, calls the CBLAS
+ // wrapper on them and uploads the a matrix again. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   auto host_a = std::vector<T>(args.a_size, zero);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.a_mat.Read(queue, args.a_size, host_a);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   const auto layout = convertToCBLAS(args.layout);
+   cblasXger(layout, args.m, args.n, args.alpha,
+             host_x, args.x_offset, args.x_inc,
+             host_y, args.y_offset, args.y_inc,
+             host_a, args.a_offset, args.a_ld);
+   buffers.a_mat.Write(queue, args.a_size, host_a);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xgerc.h b/test/routines/level2/xgerc.h
index 77258d92..4b6954f6 100644
--- a/test/routines/level2/xgerc.h
+++ b/test/routines/level2/xgerc.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -72,7 +77,7 @@ class TestXgerc {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gerc(args.layout,
@@ -86,18 +91,39 @@ class TestXgerc {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXgerc(static_cast<clblasOrder>(args.layout),
- args.m, args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS gerc reference on the device buffers, using the handle returned by queue().
+ // The layout is cast to its clBLAS enum type. Returns the clBLAS status cast to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   auto status = clblasXgerc(static_cast<clblasOrder>(args.layout),
+                             args.m, args.n, args.alpha,
+                             buffers.x_vec(), args.x_offset, args.x_inc,
+                             buffers.y_vec(), args.y_offset, args.y_inc,
+                             buffers.a_mat(), args.a_offset, args.a_ld,
+                             1, &queue_plain, 0, nullptr, &event);
+   // Wait only if an event was handed back, and release it afterwards so it is not leaked.
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for gerc: downloads the a, x and y buffers into host vectors, calls the CBLAS
+ // wrapper on them and uploads the a matrix again. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   auto host_a = std::vector<T>(args.a_size, zero);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.a_mat.Read(queue, args.a_size, host_a);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   const auto layout = convertToCBLAS(args.layout);
+   cblasXgerc(layout, args.m, args.n, args.alpha,
+              host_x, args.x_offset, args.x_inc,
+              host_y, args.y_offset, args.y_inc,
+              host_a, args.a_offset, args.a_ld);
+   buffers.a_mat.Write(queue, args.a_size, host_a);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xgeru.h b/test/routines/level2/xgeru.h
index e5f5f235..295e69e5 100644
--- a/test/routines/level2/xgeru.h
+++ b/test/routines/level2/xgeru.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -72,7 +77,7 @@ class TestXgeru {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Geru(args.layout,
@@ -86,18 +91,39 @@ class TestXgeru {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXgeru(static_cast<clblasOrder>(args.layout),
- args.m, args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS geru reference on the device buffers, using the handle returned by queue().
+ // The layout is cast to its clBLAS enum type. Returns the clBLAS status cast to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   auto queue_plain = queue();
+   auto event = cl_event{};
+   auto status = clblasXgeru(static_cast<clblasOrder>(args.layout),
+                             args.m, args.n, args.alpha,
+                             buffers.x_vec(), args.x_offset, args.x_inc,
+                             buffers.y_vec(), args.y_offset, args.y_inc,
+                             buffers.a_mat(), args.a_offset, args.a_ld,
+                             1, &queue_plain, 0, nullptr, &event);
+   // Wait only if an event was handed back, and release it afterwards so it is not leaked.
+   if (event != nullptr) {
+     clWaitForEvents(1, &event);
+     clReleaseEvent(event);
+   }
+   return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for geru: downloads the a, x and y buffers into host vectors, calls the CBLAS
+ // wrapper on them and uploads the a matrix again. Always returns kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+   const auto zero = static_cast<T>(0);
+   auto host_a = std::vector<T>(args.a_size, zero);
+   auto host_x = std::vector<T>(args.x_size, zero);
+   auto host_y = std::vector<T>(args.y_size, zero);
+   buffers.a_mat.Read(queue, args.a_size, host_a);
+   buffers.x_vec.Read(queue, args.x_size, host_x);
+   buffers.y_vec.Read(queue, args.y_size, host_y);
+   const auto layout = convertToCBLAS(args.layout);
+   cblasXgeru(layout, args.m, args.n, args.alpha,
+              host_x, args.x_offset, args.x_inc,
+              host_y, args.y_offset, args.y_inc,
+              host_a, args.a_offset, args.a_ld);
+   buffers.a_mat.Write(queue, args.a_size, host_a);
+   return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.h
index 34e1502f..e0bdc4da 100644
--- a/test/routines/level2/xhbmv.h
+++ b/test/routines/level2/xhbmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXhbmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hbmv(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXhbmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXhbmv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.kl, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference run: calls clblasXhbmv on the buffers and returns its status
+ auto queue_plain = queue(); // presumably the raw OpenCL queue handle behind the Queue wrapper — confirm in clpp11.h
+ auto event = cl_event{}; // value-initialized handle; its address is handed to clBLAS as the output event
+ auto status = clblasXhbmv(static_cast<clblasOrder>(args.layout), // layout/triangle enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.kl, args.alpha, // args.kl is the value passed in the band-width position
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event); // one command queue, empty wait list, completion event returned in `event`
+ clWaitForEvents(1, &event); // waits on the event regardless of `status`; the wait's own return value is ignored
+ return static_cast<StatusCode>(status); // NOTE(review): `event` is not released in this block — confirm whether clReleaseEvent is needed
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // CPU BLAS reference run: stages a/x/y in host vectors, calls cblasXhbmv, stores y back
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host staging vectors, sized from args
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fill the host vectors from the buffers (presumably a device-to-host copy — confirm in clpp11.h)
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXhbmv(convertToCBLAS(args.layout), // presumably the wrapper declared in wrapper_cblas.h; it receives the host vectors, not the buffers
+ convertToCBLAS(args.triangle),
+ args.n, args.kl, args.alpha, // args.kl is the value passed in the band-width position
+ a_mat_cpu, args.a_offset, args.a_ld,
+ x_vec_cpu, args.x_offset, args.x_inc, args.beta,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); // only y_vec is written back; a and x are not
+ return StatusCode::kSuccess; // unconditional: no status from the CBLAS wrapper is inspected
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xhemv.h b/test/routines/level2/xhemv.h
index 80e22157..fa242961 100644
--- a/test/routines/level2/xhemv.h
+++ b/test/routines/level2/xhemv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXhemv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hemv(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXhemv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXhemv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference run: calls clblasXhemv on the buffers and returns its status
+ auto queue_plain = queue(); // presumably the raw OpenCL queue handle behind the Queue wrapper — confirm in clpp11.h
+ auto event = cl_event{}; // value-initialized handle; its address is handed to clBLAS as the output event
+ auto status = clblasXhemv(static_cast<clblasOrder>(args.layout), // layout/triangle enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event); // one command queue, empty wait list, completion event returned in `event`
+ clWaitForEvents(1, &event); // waits on the event regardless of `status`; the wait's own return value is ignored
+ return static_cast<StatusCode>(status); // NOTE(review): `event` is not released in this block — confirm whether clReleaseEvent is needed
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // CPU BLAS reference run: stages a/x/y in host vectors, calls cblasXhemv, stores y back
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host staging vectors, sized from args
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fill the host vectors from the buffers (presumably a device-to-host copy — confirm in clpp11.h)
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXhemv(convertToCBLAS(args.layout), // presumably the wrapper declared in wrapper_cblas.h; it receives the host vectors, not the buffers
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ a_mat_cpu, args.a_offset, args.a_ld,
+ x_vec_cpu, args.x_offset, args.x_inc, args.beta,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); // only y_vec is written back; a and x are not
+ return StatusCode::kSuccess; // unconditional: no status from the CBLAS wrapper is inspected
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xher.h b/test/routines/level2/xher.h
index 53c4200f..7d0e8cc3 100644
--- a/test/routines/level2/xher.h
+++ b/test/routines/level2/xher.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -66,7 +71,7 @@ class TestXher {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Her(args.layout, args.triangle,
@@ -79,18 +84,37 @@ class TestXher {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXher(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference run: calls clblasXher; args is Arguments<U> while the buffers hold T
+ auto queue_plain = queue(); // presumably the raw OpenCL queue handle behind the Queue wrapper — confirm in clpp11.h
+ auto event = cl_event{}; // value-initialized handle; its address is handed to clBLAS as the output event
+ auto status = clblasXher(static_cast<clblasOrder>(args.layout), // layout/triangle enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ 1, &queue_plain, 0, nullptr, &event); // one command queue, empty wait list, completion event returned in `event`
+ clWaitForEvents(1, &event); // waits on the event regardless of `status`; the wait's own return value is ignored
+ return static_cast<StatusCode>(status); // NOTE(review): `event` is not released in this block — confirm whether clReleaseEvent is needed
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { // CPU BLAS reference run: stages a/x in host vectors of T, calls cblasXher, stores a back
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host staging vectors, sized from args
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fill the host vectors from the buffers (presumably a device-to-host copy — confirm in clpp11.h)
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ cblasXher(convertToCBLAS(args.layout), // presumably the wrapper declared in wrapper_cblas.h; it receives the host vectors, not the buffers
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha, // alpha comes from Arguments<U>, the element vectors are of type T
+ x_vec_cpu, args.x_offset, args.x_inc,
+ a_mat_cpu, args.a_offset, args.a_ld);
+ buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); // only a_mat is written back; x is not
+ return StatusCode::kSuccess; // unconditional: no status from the CBLAS wrapper is inspected
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xher2.h b/test/routines/level2/xher2.h
index c12ff827..445bba74 100644
--- a/test/routines/level2/xher2.h
+++ b/test/routines/level2/xher2.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXher2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Her2(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXher2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXher2(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference run: calls clblasXher2 on the buffers and returns its status
+ auto queue_plain = queue(); // presumably the raw OpenCL queue handle behind the Queue wrapper — confirm in clpp11.h
+ auto event = cl_event{}; // value-initialized handle; its address is handed to clBLAS as the output event
+ auto status = clblasXher2(static_cast<clblasOrder>(args.layout), // layout/triangle enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ 1, &queue_plain, 0, nullptr, &event); // one command queue, empty wait list, completion event returned in `event`
+ clWaitForEvents(1, &event); // waits on the event regardless of `status`; the wait's own return value is ignored
+ return static_cast<StatusCode>(status); // NOTE(review): `event` is not released in this block — confirm whether clReleaseEvent is needed
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // CPU BLAS reference run: stages a/x/y in host vectors, calls cblasXher2, stores a back
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host staging vectors, sized from args
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fill the host vectors from the buffers (presumably a device-to-host copy — confirm in clpp11.h)
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXher2(convertToCBLAS(args.layout), // presumably the wrapper declared in wrapper_cblas.h; it receives the host vectors, not the buffers
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc,
+ a_mat_cpu, args.a_offset, args.a_ld);
+ buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); // only a_mat is written back; x and y are not
+ return StatusCode::kSuccess; // unconditional: no status from the CBLAS wrapper is inspected
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xhpmv.h b/test/routines/level2/xhpmv.h
index 8fd85b62..406e564f 100644
--- a/test/routines/level2/xhpmv.h
+++ b/test/routines/level2/xhpmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXhpmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hpmv(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXhpmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXhpmv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.ap_mat(), args.ap_offset,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference run: calls clblasXhpmv on the buffers and returns its status
+ auto queue_plain = queue(); // presumably the raw OpenCL queue handle behind the Queue wrapper — confirm in clpp11.h
+ auto event = cl_event{}; // value-initialized handle; its address is handed to clBLAS as the output event
+ auto status = clblasXhpmv(static_cast<clblasOrder>(args.layout), // layout/triangle enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.ap_mat(), args.ap_offset, // ap_mat is passed with an offset only, no leading dimension
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event); // one command queue, empty wait list, completion event returned in `event`
+ clWaitForEvents(1, &event); // waits on the event regardless of `status`; the wait's own return value is ignored
+ return static_cast<StatusCode>(status); // NOTE(review): `event` is not released in this block — confirm whether clReleaseEvent is needed
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // CPU BLAS reference run: stages ap/x/y in host vectors, calls cblasXhpmv, stores y back
+ std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); // zero-filled host staging vectors, sized from args
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); // fill the host vectors from the buffers (presumably a device-to-host copy — confirm in clpp11.h)
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXhpmv(convertToCBLAS(args.layout), // presumably the wrapper declared in wrapper_cblas.h; it receives the host vectors, not the buffers
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ ap_mat_cpu, args.ap_offset, // ap_mat is passed with an offset only, no leading dimension
+ x_vec_cpu, args.x_offset, args.x_inc, args.beta,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); // only y_vec is written back; ap and x are not
+ return StatusCode::kSuccess; // unconditional: no status from the CBLAS wrapper is inspected
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xhpr.h b/test/routines/level2/xhpr.h
index 03599ddc..6f56d3f3 100644
--- a/test/routines/level2/xhpr.h
+++ b/test/routines/level2/xhpr.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -66,7 +71,7 @@ class TestXhpr {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hpr(args.layout, args.triangle,
@@ -79,18 +84,37 @@ class TestXhpr {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXhpr(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.ap_mat(), args.ap_offset,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference run: calls clblasXhpr; args is Arguments<U> while the buffers hold T
+ auto queue_plain = queue(); // presumably the raw OpenCL queue handle behind the Queue wrapper — confirm in clpp11.h
+ auto event = cl_event{}; // value-initialized handle; its address is handed to clBLAS as the output event
+ auto status = clblasXhpr(static_cast<clblasOrder>(args.layout), // layout/triangle enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.ap_mat(), args.ap_offset, // ap_mat is passed with an offset only, no leading dimension
+ 1, &queue_plain, 0, nullptr, &event); // one command queue, empty wait list, completion event returned in `event`
+ clWaitForEvents(1, &event); // waits on the event regardless of `status`; the wait's own return value is ignored
+ return static_cast<StatusCode>(status); // NOTE(review): `event` is not released in this block — confirm whether clReleaseEvent is needed
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { // CPU BLAS reference run: stages ap/x in host vectors of T, calls cblasXhpr, stores ap back
+ std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); // zero-filled host staging vectors, sized from args
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); // fill the host vectors from the buffers (presumably a device-to-host copy — confirm in clpp11.h)
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ cblasXhpr(convertToCBLAS(args.layout), // presumably the wrapper declared in wrapper_cblas.h; it receives the host vectors, not the buffers
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha, // alpha comes from Arguments<U>, the element vectors are of type T
+ x_vec_cpu, args.x_offset, args.x_inc,
+ ap_mat_cpu, args.ap_offset);
+ buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); // only ap_mat is written back; x is not
+ return StatusCode::kSuccess; // unconditional: no status from the CBLAS wrapper is inspected
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xhpr2.h b/test/routines/level2/xhpr2.h
index 68fbc76d..43889cb9 100644
--- a/test/routines/level2/xhpr2.h
+++ b/test/routines/level2/xhpr2.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXhpr2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hpr2(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXhpr2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXhpr2(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.ap_mat(), args.ap_offset,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference run: calls clblasXhpr2 on the buffers and returns its status
+ auto queue_plain = queue(); // presumably the raw OpenCL queue handle behind the Queue wrapper — confirm in clpp11.h
+ auto event = cl_event{}; // value-initialized handle; its address is handed to clBLAS as the output event
+ auto status = clblasXhpr2(static_cast<clblasOrder>(args.layout), // layout/triangle enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.ap_mat(), args.ap_offset, // ap_mat is passed with an offset only, no leading dimension
+ 1, &queue_plain, 0, nullptr, &event); // one command queue, empty wait list, completion event returned in `event`
+ clWaitForEvents(1, &event); // waits on the event regardless of `status`; the wait's own return value is ignored
+ return static_cast<StatusCode>(status); // NOTE(review): `event` is not released in this block — confirm whether clReleaseEvent is needed
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // CPU BLAS reference run: stages ap/x/y in host vectors, calls cblasXhpr2, stores ap back
+ std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); // zero-filled host staging vectors, sized from args
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); // fill the host vectors from the buffers (presumably a device-to-host copy — confirm in clpp11.h)
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXhpr2(convertToCBLAS(args.layout), // presumably the wrapper declared in wrapper_cblas.h; it receives the host vectors, not the buffers
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ y_vec_cpu, args.y_offset, args.y_inc,
+ ap_mat_cpu, args.ap_offset);
+ buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); // only ap_mat is written back; x and y are not
+ return StatusCode::kSuccess; // unconditional: no status from the CBLAS wrapper is inspected
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.h
index 5bc17e49..9a5c5140 100644
--- a/test/routines/level2/xsbmv.h
+++ b/test/routines/level2/xsbmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXsbmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Sbmv(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXsbmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsbmv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.kl, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference run: calls clblasXsbmv on the buffers and returns its status
+ auto queue_plain = queue(); // presumably the raw OpenCL queue handle behind the Queue wrapper — confirm in clpp11.h
+ auto event = cl_event{}; // value-initialized handle; its address is handed to clBLAS as the output event
+ auto status = clblasXsbmv(static_cast<clblasOrder>(args.layout), // layout/triangle enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.kl, args.alpha, // args.kl is the value passed in the band-width position
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event); // one command queue, empty wait list, completion event returned in `event`
+ clWaitForEvents(1, &event); // waits on the event regardless of `status`; the wait's own return value is ignored
+ return static_cast<StatusCode>(status); // NOTE(review): `event` is not released in this block — confirm whether clReleaseEvent is needed
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // CPU BLAS reference run: stages a/x/y in host vectors, calls cblasXsbmv, stores y back
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host staging vectors, sized from args
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fill the host vectors from the buffers (presumably a device-to-host copy — confirm in clpp11.h)
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXsbmv(convertToCBLAS(args.layout), // presumably the wrapper declared in wrapper_cblas.h; it receives the host vectors, not the buffers
+ convertToCBLAS(args.triangle),
+ args.n, args.kl, args.alpha, // args.kl is the value passed in the band-width position
+ a_mat_cpu, args.a_offset, args.a_ld,
+ x_vec_cpu, args.x_offset, args.x_inc, args.beta,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); // only y_vec is written back; a and x are not
+ return StatusCode::kSuccess; // unconditional: no status from the CBLAS wrapper is inspected
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.h
index e335da42..913af0cd 100644
--- a/test/routines/level2/xspmv.h
+++ b/test/routines/level2/xspmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXspmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Spmv(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXspmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXspmv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.ap_mat(), args.ap_offset,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference run: calls clblasXspmv on the buffers and returns its status
+ auto queue_plain = queue(); // presumably the raw OpenCL queue handle behind the Queue wrapper — confirm in clpp11.h
+ auto event = cl_event{}; // value-initialized handle; its address is handed to clBLAS as the output event
+ auto status = clblasXspmv(static_cast<clblasOrder>(args.layout), // layout/triangle enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.ap_mat(), args.ap_offset, // ap_mat is passed with an offset only, no leading dimension
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event); // one command queue, empty wait list, completion event returned in `event`
+ clWaitForEvents(1, &event); // waits on the event regardless of `status`; the wait's own return value is ignored
+ return static_cast<StatusCode>(status); // NOTE(review): `event` is not released in this block — confirm whether clReleaseEvent is needed
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // CPU BLAS reference run: stages ap/x/y in host vectors, calls cblasXspmv, stores y back
+ std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); // zero-filled host staging vectors, sized from args
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+ buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); // fill the host vectors from the buffers (presumably a device-to-host copy — confirm in clpp11.h)
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+ cblasXspmv(convertToCBLAS(args.layout), // presumably the wrapper declared in wrapper_cblas.h; it receives the host vectors, not the buffers
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ ap_mat_cpu, args.ap_offset, // ap_mat is passed with an offset only, no leading dimension
+ x_vec_cpu, args.x_offset, args.x_inc, args.beta,
+ y_vec_cpu, args.y_offset, args.y_inc);
+ buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); // only y_vec is written back; ap and x are not
+ return StatusCode::kSuccess; // unconditional: no status from the CBLAS wrapper is inspected
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xspr.h b/test/routines/level2/xspr.h
index 819b1ca8..bab5c541 100644
--- a/test/routines/level2/xspr.h
+++ b/test/routines/level2/xspr.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -66,7 +71,7 @@ class TestXspr {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Spr(args.layout, args.triangle,
@@ -79,18 +84,37 @@ class TestXspr {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXspr(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.ap_mat(), args.ap_offset,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // clBLAS reference run: calls clblasXspr on the buffers and returns its status
+ auto queue_plain = queue(); // presumably the raw OpenCL queue handle behind the Queue wrapper — confirm in clpp11.h
+ auto event = cl_event{}; // value-initialized handle; its address is handed to clBLAS as the output event
+ auto status = clblasXspr(static_cast<clblasOrder>(args.layout), // layout/triangle enums are cast directly to the clBLAS enum types
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.ap_mat(), args.ap_offset, // ap_mat is passed with an offset only, no leading dimension
+ 1, &queue_plain, 0, nullptr, &event); // one command queue, empty wait list, completion event returned in `event`
+ clWaitForEvents(1, &event); // waits on the event regardless of `status`; the wait's own return value is ignored
+ return static_cast<StatusCode>(status); // NOTE(review): `event` is not released in this block — confirm whether clReleaseEvent is needed
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { // CPU BLAS reference run: stages ap/x in host vectors, calls cblasXspr, stores ap back
+ std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); // zero-filled host staging vectors, sized from args
+ std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+ buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); // fill the host vectors from the buffers (presumably a device-to-host copy — confirm in clpp11.h)
+ buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+ cblasXspr(convertToCBLAS(args.layout), // presumably the wrapper declared in wrapper_cblas.h; it receives the host vectors, not the buffers
+ convertToCBLAS(args.triangle),
+ args.n, args.alpha,
+ x_vec_cpu, args.x_offset, args.x_inc,
+ ap_mat_cpu, args.ap_offset);
+ buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); // only ap_mat is written back; x is not
+ return StatusCode::kSuccess; // unconditional: no status from the CBLAS wrapper is inspected
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xspr2.h b/test/routines/level2/xspr2.h
index 43d66c9e..41a04cc0 100644
--- a/test/routines/level2/xspr2.h
+++ b/test/routines/level2/xspr2.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXspr2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Spr2(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXspr2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXspr2(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.ap_mat(), args.ap_offset,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS SPR2 reference on the device buffers. When the call reports success, waits for
+ // its completion event and releases it so the cl_event handle is not leaked. Returns the clBLAS
+ // status converted to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXspr2(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.ap_mat(), args.ap_offset,
+ 1, &queue_plain, 0, nullptr, &event);
+ const auto result = static_cast<StatusCode>(status);
+ // Only touch the event when the call succeeded; a failed call may leave it unset
+ if (result == StatusCode::kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
+ return result;
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for SPR2: copies the device buffers to host vectors, runs cblasXspr2 on those
+ // copies and uploads the packed matrix again. Always returns StatusCode::kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ const auto zero = static_cast<T>(0);
+ auto host_ap = std::vector<T>(args.ap_size, zero);
+ auto host_x = std::vector<T>(args.x_size, zero);
+ auto host_y = std::vector<T>(args.y_size, zero);
+ // Download all three operands before calling the CPU routine
+ buffers.ap_mat.Read(queue, args.ap_size, host_ap);
+ buffers.x_vec.Read(queue, args.x_size, host_x);
+ buffers.y_vec.Read(queue, args.y_size, host_y);
+ const auto layout = convertToCBLAS(args.layout);
+ const auto triangle = convertToCBLAS(args.triangle);
+ cblasXspr2(layout, triangle, args.n, args.alpha,
+ host_x, args.x_offset, args.x_inc,
+ host_y, args.y_offset, args.y_inc,
+ host_ap, args.ap_offset);
+ // Only the packed matrix is uploaded again
+ buffers.ap_mat.Write(queue, args.ap_size, host_ap);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xsymv.h b/test/routines/level2/xsymv.h
index 13473a3e..0576bc1f 100644
--- a/test/routines/level2/xsymv.h
+++ b/test/routines/level2/xsymv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXsymv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Symv(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXsymv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsymv(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS SYMV reference on the device buffers. When the call reports success, waits for
+ // its completion event and releases it so the cl_event handle is not leaked. Returns the clBLAS
+ // status converted to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXsymv(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ const auto result = static_cast<StatusCode>(status);
+ // Only touch the event when the call succeeded; a failed call may leave it unset
+ if (result == StatusCode::kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
+ return result;
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for SYMV: copies the device buffers to host vectors, runs cblasXsymv on those
+ // copies and uploads the y vector again. Always returns StatusCode::kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ const auto zero = static_cast<T>(0);
+ auto host_a = std::vector<T>(args.a_size, zero);
+ auto host_x = std::vector<T>(args.x_size, zero);
+ auto host_y = std::vector<T>(args.y_size, zero);
+ // Download all three operands before calling the CPU routine
+ buffers.a_mat.Read(queue, args.a_size, host_a);
+ buffers.x_vec.Read(queue, args.x_size, host_x);
+ buffers.y_vec.Read(queue, args.y_size, host_y);
+ const auto layout = convertToCBLAS(args.layout);
+ const auto triangle = convertToCBLAS(args.triangle);
+ cblasXsymv(layout, triangle, args.n, args.alpha,
+ host_a, args.a_offset, args.a_ld,
+ host_x, args.x_offset, args.x_inc, args.beta,
+ host_y, args.y_offset, args.y_inc);
+ // Only the y vector is uploaded again
+ buffers.y_vec.Write(queue, args.y_size, host_y);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xsyr.h b/test/routines/level2/xsyr.h
index 66b75c0c..062eea5a 100644
--- a/test/routines/level2/xsyr.h
+++ b/test/routines/level2/xsyr.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -66,7 +71,7 @@ class TestXsyr {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syr(args.layout, args.triangle,
@@ -79,18 +84,37 @@ class TestXsyr {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsyr(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS SYR reference on the device buffers. When the call reports success, waits for
+ // its completion event and releases it so the cl_event handle is not leaked. Returns the clBLAS
+ // status converted to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXsyr(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ const auto result = static_cast<StatusCode>(status);
+ // Only touch the event when the call succeeded; a failed call may leave it unset
+ if (result == StatusCode::kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
+ return result;
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for SYR: copies the device buffers to host vectors, runs cblasXsyr on those copies
+ // and uploads the A matrix again. Always returns StatusCode::kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ const auto zero = static_cast<T>(0);
+ // Host copy of the A matrix; this is the buffer that is uploaded again below
+ auto host_a = std::vector<T>(args.a_size, zero);
+ buffers.a_mat.Read(queue, args.a_size, host_a);
+ // Host copy of the x vector (not written back)
+ auto host_x = std::vector<T>(args.x_size, zero);
+ buffers.x_vec.Read(queue, args.x_size, host_x);
+ const auto layout = convertToCBLAS(args.layout);
+ const auto triangle = convertToCBLAS(args.triangle);
+ cblasXsyr(layout, triangle, args.n, args.alpha,
+ host_x, args.x_offset, args.x_inc,
+ host_a, args.a_offset, args.a_ld);
+ buffers.a_mat.Write(queue, args.a_size, host_a);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xsyr2.h b/test/routines/level2/xsyr2.h
index 32497a61..50bc3cea 100644
--- a/test/routines/level2/xsyr2.h
+++ b/test/routines/level2/xsyr2.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -70,7 +75,7 @@ class TestXsyr2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syr2(args.layout, args.triangle,
@@ -84,19 +89,41 @@ class TestXsyr2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsyr2(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS SYR2 reference on the device buffers. When the call reports success, waits for
+ // its completion event and releases it so the cl_event handle is not leaked. Returns the clBLAS
+ // status converted to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXsyr2(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ const auto result = static_cast<StatusCode>(status);
+ // Only touch the event when the call succeeded; a failed call may leave it unset
+ if (result == StatusCode::kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
+ return result;
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for SYR2: copies the device buffers to host vectors, runs cblasXsyr2 on those
+ // copies and uploads the A matrix again. Always returns StatusCode::kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ const auto zero = static_cast<T>(0);
+ auto host_a = std::vector<T>(args.a_size, zero);
+ auto host_x = std::vector<T>(args.x_size, zero);
+ auto host_y = std::vector<T>(args.y_size, zero);
+ // Download all three operands before calling the CPU routine
+ buffers.a_mat.Read(queue, args.a_size, host_a);
+ buffers.x_vec.Read(queue, args.x_size, host_x);
+ buffers.y_vec.Read(queue, args.y_size, host_y);
+ const auto layout = convertToCBLAS(args.layout);
+ const auto triangle = convertToCBLAS(args.triangle);
+ cblasXsyr2(layout, triangle, args.n, args.alpha,
+ host_x, args.x_offset, args.x_inc,
+ host_y, args.y_offset, args.y_inc,
+ host_a, args.a_offset, args.a_ld);
+ // Only the A matrix is uploaded again
+ buffers.a_mat.Write(queue, args.a_size, host_a);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h
index dbdddb65..600b4131 100644
--- a/test/routines/level2/xtbmv.h
+++ b/test/routines/level2/xtbmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -65,7 +70,7 @@ class TestXtbmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
@@ -78,20 +83,41 @@ class TestXtbmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- static_cast<clblasDiag>(args.diagonal),
- args.n, args.kl,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS TBMV reference on the device buffers. When the call reports success, waits for
+ // its completion event and releases it so the cl_event handle is not leaked. Returns the clBLAS
+ // status converted to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasDiag>(args.diagonal),
+ args.n, args.kl,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ const auto result = static_cast<StatusCode>(status);
+ // Only touch the event when the call succeeded; a failed call may leave it unset
+ if (result == StatusCode::kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
+ return result;
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for TBMV: copies the device buffers to host vectors, runs cblasXtbmv on those
+ // copies and uploads the x vector again. Always returns StatusCode::kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ const auto zero = static_cast<T>(0);
+ // Host copy of the A matrix (not written back)
+ auto host_a = std::vector<T>(args.a_size, zero);
+ buffers.a_mat.Read(queue, args.a_size, host_a);
+ // Host copy of the x vector; this is the buffer that is uploaded again below
+ auto host_x = std::vector<T>(args.x_size, zero);
+ buffers.x_vec.Read(queue, args.x_size, host_x);
+ const auto layout = convertToCBLAS(args.layout);
+ const auto triangle = convertToCBLAS(args.triangle);
+ const auto a_transpose = convertToCBLAS(args.a_transpose);
+ const auto diagonal = convertToCBLAS(args.diagonal);
+ cblasXtbmv(layout, triangle, a_transpose, diagonal,
+ args.n, args.kl,
+ host_a, args.a_offset, args.a_ld,
+ host_x, args.x_offset, args.x_inc);
+ buffers.x_vec.Write(queue, args.x_size, host_x);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h
index 4425765e..fc0cf393 100644
--- a/test/routines/level2/xtpmv.h
+++ b/test/routines/level2/xtpmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -65,7 +70,7 @@ class TestXtpmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
@@ -78,20 +83,41 @@ class TestXtpmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- static_cast<clblasDiag>(args.diagonal),
- args.n,
- buffers.ap_mat(), args.ap_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS TPMV reference on the device buffers. When the call reports success, waits for
+ // its completion event and releases it so the cl_event handle is not leaked. Returns the clBLAS
+ // status converted to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasDiag>(args.diagonal),
+ args.n,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ const auto result = static_cast<StatusCode>(status);
+ // Only touch the event when the call succeeded; a failed call may leave it unset
+ if (result == StatusCode::kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
+ return result;
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for TPMV: copies the device buffers to host vectors, runs cblasXtpmv on those
+ // copies and uploads the x vector again. Always returns StatusCode::kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ const auto zero = static_cast<T>(0);
+ // Host copy of the packed matrix (not written back)
+ auto host_ap = std::vector<T>(args.ap_size, zero);
+ buffers.ap_mat.Read(queue, args.ap_size, host_ap);
+ // Host copy of the x vector; this is the buffer that is uploaded again below
+ auto host_x = std::vector<T>(args.x_size, zero);
+ buffers.x_vec.Read(queue, args.x_size, host_x);
+ const auto layout = convertToCBLAS(args.layout);
+ const auto triangle = convertToCBLAS(args.triangle);
+ const auto a_transpose = convertToCBLAS(args.a_transpose);
+ const auto diagonal = convertToCBLAS(args.diagonal);
+ cblasXtpmv(layout, triangle, a_transpose, diagonal,
+ args.n,
+ host_ap, args.ap_offset,
+ host_x, args.x_offset, args.x_inc);
+ buffers.x_vec.Write(queue, args.x_size, host_x);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h
index 1c0c6fd8..fec72124 100644
--- a/test/routines/level2/xtrmv.h
+++ b/test/routines/level2/xtrmv.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -65,7 +70,7 @@ class TestXtrmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
@@ -78,20 +83,41 @@ class TestXtrmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- static_cast<clblasDiag>(args.diagonal),
- args.n,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS TRMV reference on the device buffers. When the call reports success, waits for
+ // its completion event and releases it so the cl_event handle is not leaked. Returns the clBLAS
+ // status converted to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasDiag>(args.diagonal),
+ args.n,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ const auto result = static_cast<StatusCode>(status);
+ // Only touch the event when the call succeeded; a failed call may leave it unset
+ if (result == StatusCode::kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
+ return result;
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for TRMV: copies the device buffers to host vectors, runs cblasXtrmv on those
+ // copies and uploads the x vector again. Always returns StatusCode::kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ const auto zero = static_cast<T>(0);
+ // Host copy of the A matrix (not written back)
+ auto host_a = std::vector<T>(args.a_size, zero);
+ buffers.a_mat.Read(queue, args.a_size, host_a);
+ // Host copy of the x vector; this is the buffer that is uploaded again below
+ auto host_x = std::vector<T>(args.x_size, zero);
+ buffers.x_vec.Read(queue, args.x_size, host_x);
+ const auto layout = convertToCBLAS(args.layout);
+ const auto triangle = convertToCBLAS(args.triangle);
+ const auto a_transpose = convertToCBLAS(args.a_transpose);
+ const auto diagonal = convertToCBLAS(args.diagonal);
+ cblasXtrmv(layout, triangle, a_transpose, diagonal,
+ args.n,
+ host_a, args.a_offset, args.a_ld,
+ host_x, args.x_offset, args.x_inc);
+ buffers.x_vec.Write(queue, args.x_size, host_x);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h
index 695b58b7..49a92936 100644
--- a/test/routines/level3/xgemm.h
+++ b/test/routines/level3/xgemm.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -78,7 +83,7 @@ class TestXgemm {
static Transposes GetBTransposes(const Transposes &all) { return all; }
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
@@ -92,20 +97,43 @@ class TestXgemm {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
- static_cast<clblasTranspose>(args.a_transpose),
- static_cast<clblasTranspose>(args.b_transpose),
- args.m, args.n, args.k, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS GEMM reference on the device buffers. When the call reports success, waits for
+ // its completion event and releases it so the cl_event handle is not leaked. Returns the clBLAS
+ // status converted to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasTranspose>(args.b_transpose),
+ args.m, args.n, args.k, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ const auto result = static_cast<StatusCode>(status);
+ // Only touch the event when the call succeeded; a failed call may leave it unset
+ if (result == StatusCode::kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
+ return result;
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for GEMM: copies the device buffers to host vectors, runs cblasXgemm on those
+ // copies and uploads the C matrix again. Always returns StatusCode::kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ const auto zero = static_cast<T>(0);
+ auto host_a = std::vector<T>(args.a_size, zero);
+ auto host_b = std::vector<T>(args.b_size, zero);
+ auto host_c = std::vector<T>(args.c_size, zero);
+ // Download all three matrices before calling the CPU routine
+ buffers.a_mat.Read(queue, args.a_size, host_a);
+ buffers.b_mat.Read(queue, args.b_size, host_b);
+ buffers.c_mat.Read(queue, args.c_size, host_c);
+ const auto layout = convertToCBLAS(args.layout);
+ const auto a_transpose = convertToCBLAS(args.a_transpose);
+ const auto b_transpose = convertToCBLAS(args.b_transpose);
+ cblasXgemm(layout, a_transpose, b_transpose,
+ args.m, args.n, args.k, args.alpha,
+ host_a, args.a_offset, args.a_ld,
+ host_b, args.b_offset, args.b_ld, args.beta,
+ host_c, args.c_offset, args.c_ld);
+ // Only the C matrix is uploaded again
+ buffers.c_mat.Write(queue, args.c_size, host_c);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.h
index 7b7134e5..40538417 100644
--- a/test/routines/level3/xhemm.h
+++ b/test/routines/level3/xhemm.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -78,7 +83,7 @@ class TestXhemm {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hemm(args.layout, args.side, args.triangle,
@@ -92,20 +97,43 @@ class TestXhemm {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXhemm(static_cast<clblasOrder>(args.layout),
- static_cast<clblasSide>(args.side),
- static_cast<clblasUplo>(args.triangle),
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS HEMM reference on the device buffers. When the call reports success, waits for
+ // its completion event and releases it so the cl_event handle is not leaked. Returns the clBLAS
+ // status converted to a StatusCode.
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXhemm(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasSide>(args.side),
+ static_cast<clblasUplo>(args.triangle),
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ const auto result = static_cast<StatusCode>(status);
+ // Only touch the event when the call succeeded; a failed call may leave it unset
+ if (result == StatusCode::kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
+ return result;
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for HEMM: copies the device buffers to host vectors, runs cblasXhemm on those
+ // copies and uploads the C matrix again. Always returns StatusCode::kSuccess.
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ const auto zero = static_cast<T>(0);
+ auto host_a = std::vector<T>(args.a_size, zero);
+ auto host_b = std::vector<T>(args.b_size, zero);
+ auto host_c = std::vector<T>(args.c_size, zero);
+ // Download all three matrices before calling the CPU routine
+ buffers.a_mat.Read(queue, args.a_size, host_a);
+ buffers.b_mat.Read(queue, args.b_size, host_b);
+ buffers.c_mat.Read(queue, args.c_size, host_c);
+ const auto layout = convertToCBLAS(args.layout);
+ const auto side = convertToCBLAS(args.side);
+ const auto triangle = convertToCBLAS(args.triangle);
+ cblasXhemm(layout, side, triangle,
+ args.m, args.n, args.alpha,
+ host_a, args.a_offset, args.a_ld,
+ host_b, args.b_offset, args.b_ld, args.beta,
+ host_c, args.c_offset, args.c_ld);
+ // Only the C matrix is uploaded again
+ buffers.c_mat.Write(queue, args.c_size, host_c);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.h
index a7fbfcbe..1ea2ad36 100644
--- a/test/routines/level3/xher2k.h
+++ b/test/routines/level3/xher2k.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -76,7 +81,7 @@ class TestXher2k {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto alpha2 = T{args.alpha, args.alpha};
@@ -91,21 +96,45 @@ class TestXher2k {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto alpha2 = T{args.alpha, args.alpha};
- auto status = clblasXher2k(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- args.n, args.k, alpha2,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ // Runs the clBLAS HER2K reference on the device buffers. When the call reports success, waits for
+ // its completion event and releases it so the cl_event handle is not leaked. Returns the clBLAS
+ // status converted to a StatusCode.
+ static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ // alpha is passed as a T built from args.alpha in both components
+ auto alpha2 = T{args.alpha, args.alpha};
+ auto status = clblasXher2k(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ args.n, args.k, alpha2,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ const auto result = static_cast<StatusCode>(status);
+ // Only touch the event when the call succeeded; a failed call may leave it unset
+ if (result == StatusCode::kSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
+ return result;
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ // CPU reference for HER2K: copies the device buffers to host vectors, runs cblasXher2k on those
+ // copies and uploads the C matrix again. Always returns StatusCode::kSuccess.
+ static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+ const auto zero = static_cast<T>(0);
+ auto host_a = std::vector<T>(args.a_size, zero);
+ auto host_b = std::vector<T>(args.b_size, zero);
+ auto host_c = std::vector<T>(args.c_size, zero);
+ // Download all three matrices before calling the CPU routine
+ buffers.a_mat.Read(queue, args.a_size, host_a);
+ buffers.b_mat.Read(queue, args.b_size, host_b);
+ buffers.c_mat.Read(queue, args.c_size, host_c);
+ // alpha is passed as a T built from args.alpha in both components
+ auto alpha2 = T{args.alpha, args.alpha};
+ const auto layout = convertToCBLAS(args.layout);
+ const auto triangle = convertToCBLAS(args.triangle);
+ const auto a_transpose = convertToCBLAS(args.a_transpose);
+ cblasXher2k(layout, triangle, a_transpose,
+ args.n, args.k, alpha2,
+ host_a, args.a_offset, args.a_ld,
+ host_b, args.b_offset, args.b_ld, args.beta,
+ host_c, args.c_offset, args.c_ld);
+ // Only the C matrix is uploaded again
+ buffers.c_mat.Write(queue, args.c_size, host_c);
+ return StatusCode::kSuccess;
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.h
index f097672f..75a7c405 100644
--- a/test/routines/level3/xherk.h
+++ b/test/routines/level3/xherk.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -69,7 +74,7 @@ class TestXherk {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Herk(args.layout, args.triangle, args.a_transpose,
@@ -82,19 +87,39 @@ class TestXherk {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXherk(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- args.n, args.k, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue(); // handle returned by queue(); passed by address to the wrapper below
+ auto event = cl_event{}; // starts out null; its address is handed to the wrapper as the output event
+ auto status = clblasXherk(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ args.n, args.k, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ if (event != nullptr) { clWaitForEvents(1, &event); clReleaseEvent(event); } // wait, then release so the event is not leaked
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host vectors sized from the arguments
+ std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fill the host vectors from the corresponding buffers
+ buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); // C is read too: it is both passed to the routine and written back below
+ cblasXherk(convertToCBLAS(args.layout), // enum arguments go through convertToCBLAS
+ convertToCBLAS(args.triangle),
+ convertToCBLAS(args.a_transpose),
+ args.n, args.k, args.alpha,
+ a_mat_cpu, args.a_offset, args.a_ld, args.beta,
+ c_mat_cpu, args.c_offset, args.c_ld);
+ buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); // store the host-side C back into the buffer
+ return StatusCode::kSuccess; // no status is taken from the CBLAS call, so success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.h
index 03cf5de9..f867c238 100644
--- a/test/routines/level3/xsymm.h
+++ b/test/routines/level3/xsymm.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -78,7 +83,7 @@ class TestXsymm {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Symm(args.layout, args.side, args.triangle,
@@ -92,20 +97,43 @@ class TestXsymm {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
- static_cast<clblasSide>(args.side),
- static_cast<clblasUplo>(args.triangle),
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue(); // handle returned by queue(); passed by address to the wrapper below
+ auto event = cl_event{}; // starts out null; its address is handed to the wrapper as the output event
+ auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasSide>(args.side),
+ static_cast<clblasUplo>(args.triangle),
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ if (event != nullptr) { clWaitForEvents(1, &event); clReleaseEvent(event); } // wait, then release so the event is not leaked
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host vectors sized from the arguments
+ std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+ std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fill the host vectors from the corresponding buffers
+ buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+ buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); // C is read too: it is both passed to the routine and written back below
+ cblasXsymm(convertToCBLAS(args.layout), // enum arguments go through convertToCBLAS
+ convertToCBLAS(args.side),
+ convertToCBLAS(args.triangle),
+ args.m, args.n, args.alpha,
+ a_mat_cpu, args.a_offset, args.a_ld,
+ b_mat_cpu, args.b_offset, args.b_ld, args.beta,
+ c_mat_cpu, args.c_offset, args.c_ld);
+ buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); // store the host-side C back into the buffer
+ return StatusCode::kSuccess; // no status is taken from the CBLAS call, so success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.h
index 89e77f83..be4e1851 100644
--- a/test/routines/level3/xsyr2k.h
+++ b/test/routines/level3/xsyr2k.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -76,7 +81,7 @@ class TestXsyr2k {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
@@ -90,20 +95,43 @@ class TestXsyr2k {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- args.n, args.k, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue(); // handle returned by queue(); passed by address to the wrapper below
+ auto event = cl_event{}; // starts out null; its address is handed to the wrapper as the output event
+ auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ args.n, args.k, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ if (event != nullptr) { clWaitForEvents(1, &event); clReleaseEvent(event); } // wait, then release so the event is not leaked
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host vectors sized from the arguments
+ std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+ std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fill the host vectors from the corresponding buffers
+ buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+ buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); // C is read too: it is both passed to the routine and written back below
+ cblasXsyr2k(convertToCBLAS(args.layout), // enum arguments go through convertToCBLAS
+ convertToCBLAS(args.triangle),
+ convertToCBLAS(args.a_transpose),
+ args.n, args.k, args.alpha,
+ a_mat_cpu, args.a_offset, args.a_ld,
+ b_mat_cpu, args.b_offset, args.b_ld, args.beta,
+ c_mat_cpu, args.c_offset, args.c_ld);
+ buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); // store the host-side C back into the buffer
+ return StatusCode::kSuccess; // no status is taken from the CBLAS call, so success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.h
index 8dacb5b3..7675e2aa 100644
--- a/test/routines/level3/xsyrk.h
+++ b/test/routines/level3/xsyrk.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -69,7 +74,7 @@ class TestXsyrk {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syrk(args.layout, args.triangle, args.a_transpose,
@@ -82,19 +87,39 @@ class TestXsyrk {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- args.n, args.k, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue(); // handle returned by queue(); passed by address to the wrapper below
+ auto event = cl_event{}; // starts out null; its address is handed to the wrapper as the output event
+ auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ args.n, args.k, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ if (event != nullptr) { clWaitForEvents(1, &event); clReleaseEvent(event); } // wait, then release so the event is not leaked
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host vectors sized from the arguments
+ std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fill the host vectors from the corresponding buffers
+ buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); // C is read too: it is both passed to the routine and written back below
+ cblasXsyrk(convertToCBLAS(args.layout), // enum arguments go through convertToCBLAS
+ convertToCBLAS(args.triangle),
+ convertToCBLAS(args.a_transpose),
+ args.n, args.k, args.alpha,
+ a_mat_cpu, args.a_offset, args.a_ld, args.beta,
+ c_mat_cpu, args.c_offset, args.c_ld);
+ buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); // store the host-side C back into the buffer
+ return StatusCode::kSuccess; // no status is taken from the CBLAS call, so success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.h
index 152cdf58..a085cb15 100644
--- a/test/routines/level3/xtrmm.h
+++ b/test/routines/level3/xtrmm.h
@@ -19,7 +19,12 @@
#include <vector>
#include <string>
-#include "wrapper_clblas.h"
+#ifdef CLBLAST_REF_CLBLAS
+ #include "wrapper_clblas.h"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+ #include "wrapper_cblas.h"
+#endif
namespace clblast {
// =================================================================================================
@@ -69,7 +74,7 @@ class TestXtrmm {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
- static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
@@ -82,21 +87,43 @@ class TestXtrmm {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
- static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = clblasXtrmm(static_cast<clblasOrder>(args.layout),
- static_cast<clblasSide>(args.side),
- static_cast<clblasUplo>(args.triangle),
- static_cast<clblasTranspose>(args.a_transpose),
- static_cast<clblasDiag>(args.diagonal),
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld,
- 1, &queue_plain, 0, nullptr, &event);
- clWaitForEvents(1, &event);
- return static_cast<StatusCode>(status);
- }
+ #ifdef CLBLAST_REF_CLBLAS
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue(); // handle returned by queue(); passed by address to the wrapper below
+ auto event = cl_event{}; // starts out null; its address is handed to the wrapper as the output event
+ auto status = clblasXtrmm(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasSide>(args.side),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasDiag>(args.diagonal),
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ if (event != nullptr) { clWaitForEvents(1, &event); clReleaseEvent(event); } // wait, then release so the event is not leaked
+ return static_cast<StatusCode>(status);
+ }
+ #endif
+
+ // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+ #ifdef CLBLAST_REF_CBLAS
+ static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); // zero-filled host vectors sized from the arguments
+ std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+ buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); // fill the host vectors from the corresponding buffers
+ buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); // B is both passed to the routine and written back below
+ cblasXtrmm(convertToCBLAS(args.layout), // enum arguments go through convertToCBLAS
+ convertToCBLAS(args.side),
+ convertToCBLAS(args.triangle),
+ convertToCBLAS(args.a_transpose),
+ convertToCBLAS(args.diagonal),
+ args.m, args.n, args.alpha,
+ a_mat_cpu, args.a_offset, args.a_ld,
+ b_mat_cpu, args.b_offset, args.b_ld);
+ buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); // store the host-side B back into the buffer (there is no C argument here)
+ return StatusCode::kSuccess; // no status is taken from the CBLAS call, so success is always reported
+ }
+ #endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h
index c690a45c..dec272b0 100644
--- a/test/wrapper_cblas.h
+++ b/test/wrapper_cblas.h
@@ -21,6 +21,13 @@
namespace clblast {
+// Conversions from CLBlast enums to CBLAS enums ('inline': defined in a header, so every including translation unit gets a definition)
+inline CBLAS_ORDER convertToCBLAS(const Layout v) { return (v == Layout::kRowMajor) ? CblasRowMajor : CblasColMajor; }
+inline CBLAS_TRANSPOSE convertToCBLAS(const Transpose v) { return (v == Transpose::kNo) ? CblasNoTrans : (v == Transpose::kYes) ? CblasTrans : CblasConjTrans; }
+inline CBLAS_UPLO convertToCBLAS(const Triangle v) { return (v == Triangle::kUpper) ? CblasUpper : CblasLower; }
+inline CBLAS_DIAG convertToCBLAS(const Diagonal v) { return (v == Diagonal::kUnit) ? CblasUnit : CblasNonUnit; }
+inline CBLAS_SIDE convertToCBLAS(const Side v) { return (v == Side::kLeft) ? CblasLeft : CblasRight; }
+
// OpenBLAS is not fully Netlib CBLAS compatible
#ifdef OPENBLAS_VERSION
using return_pointer_float = openblas_complex_float*;