summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-04-16 17:53:51 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2017-04-16 17:53:51 +0200
commite3bb58f60277e70a26b2cef782945027871135d5 (patch)
tree5db934bba015b9fe16a5c52958eaa30431929484
parentf7f8ec644f51d16f888b6a7086009b79c0beef8f (diff)
Finalized support for performance testing against cuBLAS
-rw-r--r--CHANGELOG1
-rw-r--r--CMakeLists.txt11
-rw-r--r--README.md2
-rw-r--r--test/wrapper_cuda.hpp4
4 files changed, 11 insertions, 7 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 0b4e9951..6643cc32 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -8,6 +8,7 @@ Development version (next release)
- Fixed bugs in the half-precision routines HTBMV/HTPMV/HTRMV/HSYR2K/HTRMM
- Tests now also exit with an error code when OpenCL errors or compilation errors occur
- Tests now also check for the L2 error in case of half-precision
+- Clients can now test against cuBLAS on NVIDIA systems for performance comparisons (-DCUBLAS=ON)
- Replaced the R graph scripts with Python/Matplotlib scripts
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0fb04071..b26de79a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,7 @@ option(TUNERS "Enable compilation of the tuners" OFF)
option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
option(TESTS "Enable compilation of the correctness tests" OFF)
option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
+option(CUBLAS "Enable performance comparison against cuBLAS on NVIDIA GPUs" OFF)
# Compile in verbose mode with additional diagnostic messages
option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
@@ -134,14 +135,16 @@ endif()
if(CLIENTS OR TESTS)
find_package(clBLAS)
find_package(CBLAS)
- find_package(cuBLAS)
- if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND AND NOT CUBLAS_FOUND)
+ if(CUBLAS)
+ find_package(cuBLAS)
+ endif()
+ if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
if(TESTS)
- message(STATUS "Could NOT find clBLAS nor a CPU BLAS nor cuBLAS, disabling the compilation of the tests")
+ message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
set(TESTS OFF)
endif()
if(CLIENTS)
- message(STATUS "Could NOT find clBLAS nor a CPU BLAS nor cuBLAS, head-to-head performance comparison not supported in the clients")
+ message(STATUS "Could NOT find clBLAS nor a CPU BLAS, head-to-head performance comparison not supported in the clients")
endif()
endif()
endif()
diff --git a/README.md b/README.md
index 3109b4bf..835f5eea 100644
--- a/README.md
+++ b/README.md
@@ -199,7 +199,7 @@ All tests can be run directly together in one go through the `make alltests` tar
Compiling the performance tests/clients (optional)
-------------
-To test the performance of CLBlast and compare optionally against [clBLAS](http://github.com/clMathLibraries/clBLAS) or a CPU BLAS library (see above for requirements), compile with the clients enabled by specifying `-DCLIENTS=ON`, for example as follows:
+To test the performance of CLBlast and compare optionally against [clBLAS](http://github.com/clMathLibraries/clBLAS), cuBLAS (if testing on an NVIDIA GPU and `-DCUBLAS=ON` is set), or a CPU BLAS library (see above for requirements), compile with the clients enabled by specifying `-DCLIENTS=ON`, for example as follows:
cmake -DCLIENTS=ON ..
diff --git a/test/wrapper_cuda.hpp b/test/wrapper_cuda.hpp
index 51f897c4..c97ae3ef 100644
--- a/test/wrapper_cuda.hpp
+++ b/test/wrapper_cuda.hpp
@@ -72,7 +72,7 @@ namespace clblast {
*buffer_cuda = nullptr;
}
#else
- template <typename T> void CUDAToHost(T*, const std::vector<T>&, const size_t) { }
+ template <typename T> void CUDAToHost(T**, const std::vector<T>&, const size_t) { }
#endif
// Allocates space on the CUDA device and copies in data from the host
@@ -96,7 +96,7 @@ namespace clblast {
}
}
#else
- template <typename T> void HostToCUDA(T*, const std::vector<T>&, const size_t) { }
+ template <typename T> void HostToCUDA(T**, const std::vector<T>&, const size_t) { }
#endif
// =================================================================================================