summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-05-30 16:38:26 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2016-05-30 16:38:26 +0200
commit305bf16c4c59f063bb9baa83235b964443eb495d (patch)
tree5a22e9768f4c420f44ba6b4b29be6073ac3f91ac
parent61105e38100d323ea270f2cbee0a824d401eaa77 (diff)
Separated the performance tests (clients) from the correctness tests in CMake
-rw-r--r--CHANGELOG7
-rw-r--r--CMakeLists.txt78
-rw-r--r--README.md37
3 files changed, 66 insertions, 56 deletions
diff --git a/CHANGELOG b/CHANGELOG
index d01f674f..5d3c6d99 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,7 @@
Development version (next release)
+- Made it possible to compile the performance tests (clients) separately from the correctness tests
+- Made a reference BLAS and head-to-head performance comparison optional in the clients
- Added support for half-precision floating-point (fp16) in the library
- Added half-precision routines:
* Level-1: HSWAP/HSCAL/HCOPY/HAXPY/HDOT/HNRM2/HASUM/HSUM/iHAMAX/iHMAX/iHMIN
@@ -11,11 +13,6 @@ Version 0.7.1
- Fixed a bug in the xGEMM routine related to the event incorrectly set
- Made MSVC link the run-time libraries statically
-Version 0.7.1
-- Improved performance of large power-of-2 xGEMM kernels for AMD GPUs
-- Fixed a bug in the xGEMM routine related to the event incorrectly set
-- Made MSVC link the run-time libraries statically
-
Version 0.7.0
- Added exports to be able to create a DLL on Windows (thanks to Marco Hutter)
- Made the library thread-safe
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8a02d290..641e7966 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,7 +24,8 @@ set(clblast_VERSION_PATCH 1)
# Options and their default values
option(SAMPLES "Enable compilation of the examples" OFF)
option(TUNERS "Enable compilation of the tuners" OFF)
-option(TESTS "Enable compilation of the performance and correctness tests" OFF)
+option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
+option(TESTS "Enable compilation of the correctness tests" OFF)
# ==================================================================================================
@@ -106,12 +107,17 @@ endif()
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake"
# and "FindCBLAS.cmake" are included.
-if(TESTS)
+if(CLIENTS OR TESTS)
find_package(clBLAS)
find_package(CBLAS)
if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
- message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
- set(TESTS OFF)
+ if(TESTS)
+ message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
+ set(TESTS OFF)
+ endif()
+ if(CLIENTS)
+ message(STATUS "Could NOT find clBLAS nor a CPU BLAS, head-to-head performance comparison not supported in the clients")
+ endif()
endif()
endif()
@@ -224,9 +230,8 @@ endif()
# ==================================================================================================
-# Down from here is all test (performance and correctness) related. Note that these tests require
-# the presence of clBLAS and/or a BLAS library to act as a reference.
-if(TESTS)
+# Section for the tests: common part for both performance ('CLIENTS') and correctness ('TESTS')
+if(CLIENTS OR TESTS)
# Sets the specifics for the reference BLAS libraries
set(REF_INCLUDES )
@@ -253,6 +258,43 @@ if(TESTS)
# Sets the include directories
include_directories(${clblast_SOURCE_DIR}/test ${REF_INCLUDES})
+endif()
+
+# ==================================================================================================
+
+# Section for the performance tests (i.e. the client). These optionally compare against a reference
+# library, either clBLAS or a CPU BLAS.
+if(CLIENTS)
+
+ # Creates the common performance-tests objects (requires CMake 2.8.8)
+ add_library(test_performance_common OBJECT test/performance/client.cc)
+
+ # Compiles the performance-tests
+ foreach(ROUTINE ${LEVEL1_ROUTINES})
+ add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
+ test/performance/routines/level1/${ROUTINE}.cc)
+ endforeach()
+ foreach(ROUTINE ${LEVEL2_ROUTINES})
+ add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
+ test/performance/routines/level2/${ROUTINE}.cc)
+ endforeach()
+ foreach(ROUTINE ${LEVEL3_ROUTINES})
+ add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
+ test/performance/routines/level3/${ROUTINE}.cc)
+ endforeach()
+ foreach(ROUTINE ${ROUTINES})
+ target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
+ install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
+ endforeach()
+
+endif()
+
+# ==================================================================================================
+
+# Section for the correctness tests. Note that these tests require the presence of clBLAS and/or a
+# CPU BLAS library to act as a reference.
+if(TESTS)
+
# Creates the common correctness-tests objects (requires CMake 2.8.8)
add_library(test_correctness_common OBJECT
test/correctness/tester.cc test/correctness/testblas.cc)
@@ -284,26 +326,6 @@ if(TESTS)
endforeach()
add_custom_target(alltests ${ALLTESTS} DEPENDS ${ALLTESTSDEPENDS})
- # Creates the common performance-tests objects (requires CMake 2.8.8)
- add_library(test_performance_common OBJECT test/performance/client.cc)
-
- # Compiles the performance-tests
- foreach(ROUTINE ${LEVEL1_ROUTINES})
- add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
- test/performance/routines/level1/${ROUTINE}.cc)
- endforeach()
- foreach(ROUTINE ${LEVEL2_ROUTINES})
- add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
- test/performance/routines/level2/${ROUTINE}.cc)
- endforeach()
- foreach(ROUTINE ${LEVEL3_ROUTINES})
- add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
- test/performance/routines/level3/${ROUTINE}.cc)
- endforeach()
- foreach(ROUTINE ${ROUTINES})
- target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
- install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
- endforeach()
-
endif()
+
# ==================================================================================================
diff --git a/README.md b/README.md
index 51c282a3..39ae5141 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ The pre-requisites for compilation of CLBlast are:
- Intel OpenCL
- Beignet
-Furthermore, to build the (optional) correctness and performance tests, another BLAS library is needed to serve as a reference. This can be either:
+Furthermore, to build the (optional) correctness tests, another BLAS library is needed to serve as a reference. This can be either:
* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD)
* A regular CPU Netlib BLAS library, e.g.:
@@ -91,7 +91,9 @@ Or alternatively the plain C version:
#include <clblast_c.h>
-Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in `samples/`.
+Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in `samples/`. They can be compiled using the CMake infrastructure of CLBlast by providing the `-DSAMPLES=ON` flag, for example as follows:
+
+ cmake -DSAMPLES=ON ..
Using the tuners (optional)
@@ -124,7 +126,7 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- ARM Mali-T628 GPU
- Intel MIC
-If your device is not (yet) among this list or if you want to tune CLBlast for specific parameters (e.g. rectangular matrix sizes), you should compile the library with the optional tuners:
+If your device is not (yet) among this list or if you want to tune CLBlast for specific parameters (e.g. rectangular matrix sizes), you should compile the library with the optional tuners by specifying `-DTUNERS=ON`, for example as follows:
cmake -DTUNERS=ON ..
@@ -145,35 +147,31 @@ In summary, tuning the entire library for your device can be done as follows (st
make
-Compiling the correctness and performance tests (optional)
+Compiling the correctness tests (optional)
-------------
-To make sure CLBlast is working correctly on your device (recommended), compile with the tests enabled:
+To make sure CLBlast is working correctly on your device (recommended), compile with the tests enabled by specifying `-DTESTS=ON`, for example as follows:
cmake -DTESTS=ON ..
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is best tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. If the library clBLAS is not installed on your system, it will use a regular CPU BLAS library to test against. If both are present, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables.
-With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test against clBLAS and/or a CPU BLAS library.
-
-Performance remarks
+Compiling the performance tests/clients (optional)
-------------
-The CLBlast library provides pre-tuned parameter-values for a number of OpenCL devices. If your device is not among these, then out-of-the-box performance might be poor. Even if the device is included performance might be poor in some cases: __the preview version is not thoroughly tested for performance yet__. See above under `Using the tuners` to find out how to tune for your device.
+To test the performance of CLBlast and optionally compare it against clBLAS or a CPU BLAS library, compile with the clients enabled by specifying `-DCLIENTS=ON`, for example as follows:
-The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared against a tuned version of the clBLAS library. The graphs of the level-3 routines (Xgemm, Xsymm, Xsyrk) show the strong points of CLBlast:
+ cmake -DCLIENTS=ON ..
-* The library reaches a high peak performance for large matrix sizes, in some cases a factor 2 more than clBLAS.
-* The performance for non-power of 2 values (e.g. 1000) is roughly equal to power of 2 cases (e.g. 1024). This is not the case for clBLAS, which sometimes shows a drop of a factor 2.
-* The performance is also constant for different layouts and transpose options. Again, this is not the case for clBLAS.
+The performance tests come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and run CLBlast directly, optionally in a head-to-head performance test against clBLAS and/or a CPU BLAS library. You can use the command-line options `-clblas 1` or `-cblas 1` to select a library to test against.
-The graphs also show the current weak points of CLBlast: for small sizes the benefit is minimal or non-existent, and for some specific configurations clBLAS is still faster.
-
-These graphs can be generated automatically on your own device. First, compile CLBlast with the tests enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable. Finally, run one of the graph-scripts found in `test/performance/graphs` using R. For example, to generate the Xgemm PDF on device 1 of platform 0:
+The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared in this case against a tuned version of the clBLAS library. These graphs can be generated automatically on your own device. First, compile CLBlast with the clients enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable. Finally, run one of the graph-scripts found in `test/performance/graphs` using R. For example, to generate the Xgemm PDF on device 1 of platform 0:
Rscript path/to/test/performance/graphs/xgemm.r 0 1
+Note that the CLBlast library provides pre-tuned parameter-values for some devices only: if your device is not among these, then out-of-the-box performance might be poor. See above under `Using the tuners` to find out how to tune for your device.
+
Supported routines
-------------
@@ -295,10 +293,3 @@ Support us
-------------
This project started in March 2015 as an evenings and weekends free-time project next to a full-time job for Cedric Nugteren. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://www.cedricnugteren.nl).
-
-
-To-do list before release of version 1.0
--------------
-
-- Add half-precision routines (e.g. HGEMM)
-- Add API documentation