summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-08-12 20:50:00 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2017-08-12 20:50:00 +0200
commit777681dcbdf18493320dd7b94fccd5c6faee9455 (patch)
treeb8597f5d79f8ef33bffbf33f3de2548cc51d4c5c
parent97bcf77d4bc9b31e32a8785787e0497ac5440e44 (diff)
parentd67fd6604b4a6584c4f9e856057fcc8076ce377d (diff)
Merge branch 'master' into im_to_col
-rw-r--r--.appveyor.yml4
-rw-r--r--.travis.yml27
-rw-r--r--CHANGELOG6
-rw-r--r--CMakeLists.txt93
-rw-r--r--README.md32
-rw-r--r--src/database/kernel_selection.hpp5
-rw-r--r--src/database/kernels/copy.hpp18
-rw-r--r--src/database/kernels/pad.hpp22
-rw-r--r--src/database/kernels/padtranspose.hpp20
-rw-r--r--src/database/kernels/transpose.hpp20
-rw-r--r--src/database/kernels/xaxpy.hpp22
-rw-r--r--src/database/kernels/xdot.hpp20
-rw-r--r--src/database/kernels/xgemm.hpp8
-rw-r--r--src/database/kernels/xgemm_direct.hpp8
-rw-r--r--src/database/kernels/xgemv.hpp6
-rw-r--r--src/database/kernels/xgemv_fast.hpp6
-rw-r--r--src/database/kernels/xgemv_fast_rot.hpp6
-rw-r--r--src/database/kernels/xger.hpp18
-rw-r--r--src/kernels/common.opencl19
-rw-r--r--src/kernels/level1/level1.opencl4
-rw-r--r--src/kernels/level2/level2.opencl24
-rw-r--r--src/kernels/level2/xgemv.opencl6
-rw-r--r--src/kernels/level2/xgemv_fast.opencl4
-rw-r--r--src/kernels/level3/copy_pad.opencl34
-rw-r--r--src/kernels/level3/invert_diagonal_blocks.opencl18
-rw-r--r--src/kernels/level3/transpose_pad.opencl38
-rw-r--r--src/kernels/level3/xgemm_direct_part1.opencl56
-rw-r--r--src/kernels/level3/xgemm_direct_part2.opencl40
-rw-r--r--src/kernels/level3/xgemm_direct_part3.opencl18
-rw-r--r--src/kernels/level3/xgemm_part1.opencl22
-rw-r--r--src/kernels/level3/xgemm_part2.opencl8
-rw-r--r--src/kernels/level3/xgemm_part3.opencl22
-rw-r--r--src/routine.cpp8
-rw-r--r--src/routines/common.cpp75
-rw-r--r--src/routines/common.hpp25
-rw-r--r--src/routines/level3/xgemm.cpp6
-rw-r--r--src/routines/levelx/xgemmbatched.cpp4
-rw-r--r--src/utilities/buffer_test.hpp8
-rw-r--r--src/utilities/utilities.cpp97
-rw-r--r--src/utilities/utilities.hpp67
-rw-r--r--test/correctness/misc/override_parameters.cpp7
-rw-r--r--test/correctness/testblas.cpp10
-rw-r--r--test/correctness/testblas.hpp6
-rw-r--r--test/correctness/tester.cpp24
-rw-r--r--test/correctness/tester.hpp44
-rw-r--r--test/performance/client.hpp2
-rw-r--r--test/routines/common.hpp1
-rw-r--r--test/routines/levelx/xgemmbatched.hpp9
-rw-r--r--test/test_utilities.cpp114
-rw-r--r--test/test_utilities.hpp99
50 files changed, 837 insertions, 423 deletions
diff --git a/.appveyor.yml b/.appveyor.yml
index a7742fa5..adb1860b 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -58,8 +58,8 @@ build_script:
after_build:
- ps: pushd $env:CLBLAST_BUILD
- - 7z a CLBlast-Windows-x64.zip .\install_dir\*
- - ps: mv CLBlast-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
+ - 7z a CLBlast-1.0.0-Windows-x64.zip .\install_dir\*
+ - ps: mv CLBlast-1.0.0-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
artifacts:
- path: '*.zip'
diff --git a/.travis.yml b/.travis.yml
index f628bb94..abd39aac 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,7 +21,10 @@ matrix:
env:
global:
+ - CLBLAST_VERSION=1.0.0
- CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast
+ - CLBLAST_INSTALL=${TRAVIS_BUILD_DIR}/bin/CLBlast-${CLBLAST_VERSION}
+ - CLBLAST_TAR=CLBlast-${CLBLAST_VERSION}-${TRAVIS_OS_NAME}-x64.tar.gz
before_install:
- cmake --version;
@@ -29,12 +32,24 @@ before_install:
- ${CXX} --version;
before_script:
+ - mkdir -p ${CLBLAST_INSTALL}
- mkdir -p ${CLBLAST_ROOT}
- pushd ${CLBLAST_ROOT}
- - cmake -DTESTS=ON -DCLIENTS=ON -DSAMPLES=ON -DNETLIB=ON ${TRAVIS_BUILD_DIR}
+ - cmake -DTESTS=ON -DCLIENTS=ON -DSAMPLES=ON -DCMAKE_INSTALL_PREFIX=${CLBLAST_INSTALL} ${TRAVIS_BUILD_DIR}
script:
- make
+ - make install
+
+after_success:
+ - pushd ${TRAVIS_BUILD_DIR}/bin
+ - rm ${CLBLAST_INSTALL}/bin/clblast_client_*
+ - rm ${CLBLAST_INSTALL}/bin/clblast_test_*
+ - echo ${CLBLAST_TAR}
+ - tar -cvf ${CLBLAST_TAR} CLBlast-${CLBLAST_VERSION}
+ - cp ${CLBLAST_TAR} ${TRAVIS_BUILD_DIR}
+ - pushd ${TRAVIS_BUILD_DIR}
+ - ls -l
branches:
only:
@@ -42,3 +57,13 @@ branches:
notifications:
email: false
+
+deploy:
+ provider: releases
+ api_key:
+ secure: oBnP56zfFTiON0v6nm6qiRevtTsojqaxV2E/+ahUP4iyZxZgn1zf9reGNEbB/s6wfHCwXpXKlCk3A0cEQzbfoZeQy3oMzyWHV/xgu+etOENe3z18oVEiVBe/WAd1/hMVmQvX65kHR+q12rce6K6rDm1mEIJC/udf5Dbdl2alVWgiL20Hrj/PSQAYZZuTmZLuMm7OBc1G2xhRmRo5FYgI2u1ZALUHDRov/yLQkoKwxAlzBhURoNTHW2wTAr3Pq01Fk2kfQFRmg7YFieu3cit/JGNzaDdgmT0U5pLRzhuPiD3qziNnC3rG7tnYV0jHQOLKH+AJ0csbNncG47JrUQrKDJGUs0fLBxHG4ErEdVc/s+l/ZTGBT6kOEjk5GLQviNuAzP51em+TATR6YJ4JdgnZEU3iwbyeY/lLPPWhOVDfUgLNVKHX7Sijf83Wp+cqspAdIcnT5lWMXUe7jciKQLC0B+jD6IQ/hCqF0/yX/H8Sa8jA+qSIrXWt/qSy1viKaQ3Sf8+rXyxG6dqYc0jUweQ248FOgUCtzmaZP48SoMBATN7JPCLzhGnY8IiMErGzc6jsevmoqB0MRqZhc2qsLEfTclxsMmfx2yVKt93G+zRMtQuYmf36MvDNbPaH+/tzE8pWxufSY0672qhL0sfvNO+FuCJ8Bsk4UwKV3lTeGjCwN5o=
+ file: ${CLBLAST_TAR}
+ skip_cleanup: true
+ on:
+ repo: CNugteren/CLBlast
+ tags: true
diff --git a/CHANGELOG b/CHANGELOG
index e7fca58c..e7e0e9ae 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,9 +1,13 @@
-Development (next version)
+Version 1.0.1
+- Fixed a bug in the direct version of the GEMM kernel
+
+Version 1.0.0
- Fixed a bug in the TRSM routine for alpha != 1
- Fixed a bug in the cache related to multi-device contexts (thanks to 'kpot')
- Fixed a bug in the direct version of the GEMM kernel
- Fixed several warnings for MSVC and Clang
+- Added support for Mesa Clover and AMD's ROCm by making the inline keyword optional in kernels
- Performance reports are now external at https://cnugteren.github.io/clblast
- Greatly improved compilation time of database.cpp
- Various minor fixes and enhancements
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a7f1bf0c..f82af47a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,9 +17,9 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
# CMake project details
project("clblast" C CXX)
-set(clblast_VERSION_MAJOR 0)
-set(clblast_VERSION_MINOR 11)
-set(clblast_VERSION_PATCH 0)
+set(clblast_VERSION_MAJOR 1)
+set(clblast_VERSION_MINOR 0)
+set(clblast_VERSION_PATCH 1)
# Options and their default values
option(BUILD_SHARED_LIBS "Build a shared (ON) or static library (OFF)" ON)
@@ -101,7 +101,11 @@ else()
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
set(FLAGS "${FLAGS} -Wno-deprecated-declarations")
- set(FLAGS "${FLAGS} -Wno-undefined-var-template")
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 3.9.0) # clang 4.0 or higher
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0) # but not for AppleClang
+ set(FLAGS "${FLAGS} -Wno-undefined-var-template")
+ endif()
+ endif()
endif()
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
@@ -155,6 +159,8 @@ endif()
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
xgemm xgemm_direct xgemv)
+set(DATABASES copy invert pad padtranspose transpose xaxpy xdot
+ xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger xtrsv)
set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched)
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
if(NETLIB)
@@ -170,7 +176,7 @@ set(PRECISIONS 32 64 3232 6464 16)
# ==================================================================================================
-# Gathers all source-files
+# Gathers all source-files (required for the compiler) and header-files (for IDEs only)
set(SOURCES
src/database/database.cpp
src/routines/common.cpp
@@ -182,27 +188,56 @@ set(SOURCES
src/routine.cpp
src/routines/levelx/xinvert.cpp # only source, don't include it as a test
)
+set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual Studio
+ include/clblast.h
+ include/clblast_c.h
+ include/clblast_half.h
+ src/database/apple_cpu_fallback.hpp
+ src/database/database.hpp
+ src/database/kernel_selection.hpp
+ src/routines/level1/xamin.hpp
+ src/routines/level1/xmax.hpp
+ src/routines/level1/xmin.hpp
+ src/routines/level1/xsum.hpp
+ src/routines/common.hpp
+ src/utilities/buffer_test.hpp
+ src/utilities/clblast_exceptions.hpp
+ src/utilities/msvc.hpp
+ src/utilities/utilities.hpp
+ src/cache.hpp
+ src/clpp11.hpp
+ src/cxpp11_common.hpp
+ src/routine.hpp
+)
if(NETLIB)
set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
+ set(HEADERS ${HEADERS} include/clblast_netlib_c.h)
endif()
foreach(ROUTINE ${LEVEL1_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
+ set(HEADERS ${HEADERS} src/routines/level1/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${LEVEL2_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cpp)
+ set(HEADERS ${HEADERS} src/routines/level2/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${LEVEL3_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cpp)
+ set(HEADERS ${HEADERS} src/routines/level3/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${LEVELX_ROUTINES})
set(SOURCES ${SOURCES} src/routines/levelx/${ROUTINE}.cpp)
+ set(HEADERS ${HEADERS} src/routines/levelx/${ROUTINE}.hpp)
+endforeach()
+foreach(DATABASE ${DATABASES})
+ set(HEADERS ${HEADERS} src/database/kernels/${DATABASE}.hpp)
endforeach()
# Creates and links the library
if(BUILD_SHARED_LIBS)
- add_library(clblast SHARED ${SOURCES})
+ add_library(clblast SHARED ${SOURCES} ${HEADERS})
else(BUILD_SHARED_LIBS)
- add_library(clblast STATIC ${SOURCES})
+ add_library(clblast STATIC ${SOURCES} ${HEADERS})
endif()
target_link_libraries(clblast ${OPENCL_LIBRARIES})
@@ -272,7 +307,7 @@ endif()
if(TUNERS)
# Visual Studio requires the sources of non-exported objects/libraries
- set(TUNERS_COMMON )
+ set(TUNERS_COMMON src/tuning/tuning.hpp)
if(MSVC)
set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities/utilities.cpp)
endif()
@@ -310,6 +345,7 @@ if(CLIENTS OR TESTS)
find_package(Threads)
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
+ set(WRAPPERS ${WRAPPERS} test/wrapper_clblas.hpp)
if(MSVC)
add_definitions(" /DCLBLAST_REF_CLBLAS")
else()
@@ -319,6 +355,7 @@ if(CLIENTS OR TESTS)
if(CBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
+ set(WRAPPERS ${WRAPPERS} test/wrapper_cblas.hpp)
if(MSVC)
add_definitions(" /DCLBLAST_REF_CBLAS")
else()
@@ -328,6 +365,7 @@ if(CLIENTS OR TESTS)
if(CUBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CUDA_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CUDA_LIBRARIES} ${CUBLAS_LIBRARIES})
+ set(WRAPPERS ${WRAPPERS} test/wrapper_cuda.hpp test/wrapper_cublas.hpp)
if(MSVC)
add_definitions(" /DCLBLAST_REF_CUBLAS")
else()
@@ -342,14 +380,16 @@ endif()
# Section for the performance tests (i.e. the client). These compare against optionally a reference
# library, either clBLAS, a CPU BLAS, or CUDA's cuBLAS.
if(CLIENTS)
+ set(CLIENTS_COMMON ${WRAPPERS} test/test_utilities.hpp
+ test/performance/client.hpp test/routines/common.hpp)
# Visual Studio requires the sources of non-exported objects/libraries
- set(CLIENTS_COMMON )
if(MSVC)
- set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities/utilities.cpp test/performance/client.cpp)
+ set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities/utilities.cpp test/test_utilities.cpp
+ test/performance/client.cpp)
else()
# Creates the common performance-tests objects (requires CMake 2.8.8)
- add_library(test_performance_common OBJECT test/performance/client.cpp)
+ add_library(test_performance_common OBJECT test/test_utilities.cpp test/performance/client.cpp)
# Adds CLBlast's interface include paths because we can't link to CLBlast here
target_include_directories(test_performance_common PRIVATE
@@ -361,19 +401,23 @@ if(CLIENTS)
# Compiles the performance-tests
foreach(ROUTINE ${LEVEL1_ROUTINES})
add_executable(clblast_client_${ROUTINE} ${CLIENTS_COMMON}
- test/performance/routines/level1/${ROUTINE}.cpp)
+ test/performance/routines/level1/${ROUTINE}.cpp
+ test/routines/level1/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${LEVEL2_ROUTINES})
add_executable(clblast_client_${ROUTINE} ${CLIENTS_COMMON}
- test/performance/routines/level2/${ROUTINE}.cpp)
+ test/performance/routines/level2/${ROUTINE}.cpp
+ test/routines/level2/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${LEVEL3_ROUTINES})
add_executable(clblast_client_${ROUTINE} ${CLIENTS_COMMON}
- test/performance/routines/level3/${ROUTINE}.cpp)
+ test/performance/routines/level3/${ROUTINE}.cpp
+ test/routines/level3/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${LEVELX_ROUTINES})
add_executable(clblast_client_${ROUTINE} ${CLIENTS_COMMON}
- test/performance/routines/levelx/${ROUTINE}.cpp)
+ test/performance/routines/levelx/${ROUTINE}.cpp
+ test/routines/levelx/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
@@ -389,16 +433,17 @@ endif()
# CPU BLAS library, and/or cuBLAS to act as a reference.
if(TESTS)
enable_testing()
+ set(TESTS_COMMON ${WRAPPERS} test/test_utilities.hpp test/correctness/testblas.hpp
+ test/correctness/tester.hpp test/routines/common.hpp)
# Visual Studio requires the sources of non-exported objects/libraries
- set(TESTS_COMMON )
if(MSVC)
- set(TESTS_COMMON ${TESTS_COMMON} src/utilities/utilities.cpp
+ set(TESTS_COMMON ${TESTS_COMMON} src/utilities/utilities.cpp test/test_utilities.cpp
test/correctness/tester.cpp test/correctness/testblas.cpp)
else()
# Creates the common correctness-tests objects (requires CMake 2.8.8)
add_library(test_correctness_common OBJECT
- test/correctness/tester.cpp test/correctness/testblas.cpp)
+ test/test_utilities.cpp test/correctness/tester.cpp test/correctness/testblas.cpp)
target_include_directories(test_correctness_common PUBLIC
$<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
${clblast_SOURCE_DIR} ${REF_INCLUDES})
@@ -408,19 +453,23 @@ if(TESTS)
# Compiles the correctness-tests
foreach(ROUTINE ${LEVEL1_ROUTINES})
add_executable(clblast_test_${ROUTINE} ${TESTS_COMMON}
- test/correctness/routines/level1/${ROUTINE}.cpp)
+ test/correctness/routines/level1/${ROUTINE}.cpp
+ test/routines/level1/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${LEVEL2_ROUTINES})
add_executable(clblast_test_${ROUTINE} ${TESTS_COMMON}
- test/correctness/routines/level2/${ROUTINE}.cpp)
+ test/correctness/routines/level2/${ROUTINE}.cpp
+ test/routines/level2/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${LEVEL3_ROUTINES})
add_executable(clblast_test_${ROUTINE} ${TESTS_COMMON}
- test/correctness/routines/level3/${ROUTINE}.cpp)
+ test/correctness/routines/level3/${ROUTINE}.cpp
+ test/routines/level3/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${LEVELX_ROUTINES})
add_executable(clblast_test_${ROUTINE} ${TESTS_COMMON}
- test/correctness/routines/levelx/${ROUTINE}.cpp)
+ test/correctness/routines/levelx/${ROUTINE}.cpp
+ test/routines/levelx/${ROUTINE}.hpp)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
diff --git a/README.md b/README.md
index 45b5ab56..2386cc88 100644
--- a/README.md
+++ b/README.md
@@ -2,14 +2,15 @@
CLBlast: The tuned OpenCL BLAS library
================
-| | master branch |
-|-----|-----|
-| Linux/OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) |
-| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) |
+| | Build status | Tests on Intel GPU | Tests on NVIDIA GPU | Tests on AMD GPU |
+|-----|-----|-----|-----|-----|
+| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-Intel/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-Intel/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-NVIDIA/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-NVIDIA/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-AMD/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-AMD/) |
+| Linux | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-Intel/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-Intel/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-NVIDIA/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-NVIDIA/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-AMD/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-AMD/) |
+| OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-OSX-Intel/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-OSX-Intel/) | N/A | N/A |
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices. See [the CLBlast website](https://cnugteren.github.io/clblast) for performance reports on various devices as well as the latest CLBlast news.
-This preview-version is not yet tuned for all OpenCL devices: __if out-of-the-box performance is poor, please run the tuners first__. See below for a list of already tuned devices and instructions on how to tune yourself and contribute to future releases of the CLBlast library.
+The library is not tuned for all possible OpenCL devices: __if out-of-the-box performance is poor, please run the tuners first__. See below for a list of already tuned devices and instructions on how to tune yourself and contribute to future releases of the CLBlast library.
Why CLBlast and not clBLAS or cuBLAS?
@@ -56,6 +57,7 @@ The pre-requisites for compilation of CLBlast are:
- AMD APP SDK
- Intel OpenCL
- Beignet
+ - Mesa Clover
An example of an out-of-source build using a command-line compiler and make (starting from the root of the CLBlast folder):
@@ -97,7 +99,7 @@ To get started quickly, a couple of stand-alone example programs are included in
cmake -DSAMPLES=ON ..
-There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions performance will be impacted severly. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. This API can be used as follows after providing the `-DNETLIB=ON` flag to CMake:
+There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions performance will be impacted severely. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. This API can be used as follows after providing the `-DNETLIB=ON` flag to CMake:
#include <clblast_netlib_c.h>
@@ -154,6 +156,7 @@ The CLBlast library is already tuned for the most commonly used OpenCL devices a
- Core i7-5930K
* Other devices:
- ARM Mali-T628 GPU
+ - Qualcomm Adreno 330 GPU
- Intel MIC
If your device is not (yet) among this list or if you want to tune CLBlast for specific parameters (e.g. rectangular matrix sizes), you should compile the library with the optional tuners by specifing `-DTUNERS=ON`, for example as follows:
@@ -279,7 +282,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
| xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ |
| xTRSM | ✔ | ✔ | ✔ | ✔ | | (experimental, un-optimized)
-Futhermore, there are also batched versions of BLAS routines available, processing multiple smaller computations in one go for better performance:
+Furthermore, there are also batched versions of BLAS routines available, processing multiple smaller computations in one go for better performance:
| Batched | S | D | C | Z | H |
| -------------|---|---|---|---|---|
@@ -302,7 +305,7 @@ Some less commonly used BLAS routines are not yet supported yet by CLBlast. They
Half precision (fp16)
-------------
-The half-precison fp16 format is a 16-bits floating-point data-type. Some OpenCL devices support the `cl_khr_fp16` extension, reducing storage and bandwidth requirements by a factor 2 compared to single-precision floating-point. In case the hardware also accelerates arithmetic on half-precision data-types, this can also greatly improve compute performance of e.g. level-3 routines such as GEMM. Devices which can benefit from this are among others Intel GPUs, ARM Mali GPUs, and NVIDIA's latest Pascal GPUs. Half-precision is in particular interest for the deep-learning community, in which convolutional neural networks can be processed much faster at a minor accuracy loss.
+The half-precision fp16 format is a 16-bits floating-point data-type. Some OpenCL devices support the `cl_khr_fp16` extension, reducing storage and bandwidth requirements by a factor 2 compared to single-precision floating-point. In case the hardware also accelerates arithmetic on half-precision data-types, this can also greatly improve compute performance of e.g. level-3 routines such as GEMM. Devices which can benefit from this are among others Intel GPUs, ARM Mali GPUs, and NVIDIA's latest Pascal GPUs. Half-precision is in particular interest for the deep-learning community, in which convolutional neural networks can be processed much faster at a minor accuracy loss.
Since there is no half-precision data-type in C or C++, OpenCL provides the `cl_half` type for the host device. Unfortunately, internally this translates to a 16-bits integer, so computations on the host using this data-type should be avoided. For convenience, CLBlast provides the `clblast_half.h` header (C99 and C++ compatible), defining the `half` type as a short-hand to `cl_half` and the following basic functions:
@@ -317,7 +320,7 @@ Contributing
Contributions are welcome in the form of tuning results for OpenCL devices previously untested or pull requests. See [the contributing guidelines](CONTRIBUTING.md) for more details.
-The contributing authors (code, pull requests, testing) so far are:
+The main contributing authors (code, pull requests, testing) are:
* [Cedric Nugteren](http://cnugteren.github.io) - main author
* [Anton Lokhmotov](https://github.com/psyhtest)
@@ -328,6 +331,8 @@ The contributing authors (code, pull requests, testing) so far are:
* [Ivan Shapovalov](https://github.com/intelfx)
* [Dimitri Van Assche](https://github.com/dvasschemacq)
* [Shehzan Mohammed](https://shehzan10.github.io)
+* [Marco Cianfriglia](https://github.com/mcian)
+* Everyone else listed as a [GitHub contributor](https://github.com/CNugteren/CLBlast/graphs/contributors)
Tuning and testing on a variety of OpenCL devices was made possible by:
@@ -336,6 +341,13 @@ Tuning and testing on a variety of OpenCL devices was made possible by:
* [dividiti](http://www.dividiti.com)
* [SURFsara HPC center](http://www.surfsara.com)
* [ArrayFire](http://arrayfire.org)
+* Everyone reporting [tuning results](https://github.com/CNugteren/CLBlast/issues/1)
+
+Hardware/software for this project was contributed by:
+
+* [ArrayFire](http://arrayfire.org) for settings up and supporting Jenkins CI correctness tests on 7 platforms
+* [JetBrains](https://www.jetbrains.com/clion/) for supply a free CLion IDE license for CLBlast developers
+* [Travis CI](https://travis-ci.org/CNugteren/CLBlast/branches) and [AppVeyor](https://ci.appveyor.com/project/CNugteren/clblast) for free automated build tests for open-source projects
More information
@@ -343,7 +355,7 @@ More information
Further information on CLBlast is available through the following links:
-* A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017. A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slideset is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf).
+* A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017. A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slide-set is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf).
* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (May 2017). For CLTune, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.
diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp
index 44ef0d32..82c7d59d 100644
--- a/src/database/kernel_selection.hpp
+++ b/src/database/kernel_selection.hpp
@@ -52,6 +52,11 @@ const Database::DatabaseEntry KernelSelectionSingle = {
{ "default", { 1280*1280*1280 } },
}
},
+ {
+ kDeviceTypeGPU, "ARM", {
+ { "default", { 128*128*128} },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 512*512*512 } },
diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp
index 259f95c3..e5defb32 100644
--- a/src/database/kernels/copy.hpp
+++ b/src/database/kernels/copy.hpp
@@ -30,6 +30,12 @@ const Database::DatabaseEntry CopyHalf = {
{ "default", { 8, 32, 4, 8 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 32, 8, 8, 1 } },
+ { "default", { 32, 8, 8, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 16, 8, 4, 4 } },
@@ -113,6 +119,12 @@ const Database::DatabaseEntry CopySingle = {
{ "default", { 8, 32, 4, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 32, 8, 8, 1 } },
+ { "default", { 32, 8, 8, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 32, 8, 4, 4 } },
@@ -187,6 +199,12 @@ const Database::DatabaseEntry CopyComplexSingle = {
{ "default", { 32, 8, 1, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 32, 8, 1, 1 } },
+ { "default", { 32, 8, 1, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 16, 8, 1, 2 } },
diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp
index f925d07d..b6ebde43 100644
--- a/src/database/kernels/pad.hpp
+++ b/src/database/kernels/pad.hpp
@@ -30,9 +30,15 @@ const Database::DatabaseEntry PadHalf = {
{ "default", { 8, 8, 2, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 16, 8, 4, 2 } },
+ { "default", { 16, 8, 4, 2 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 8, 8, 2, 1 } },
+ { "default", { 8, 8, 4, 1 } },
}
},
}
@@ -113,6 +119,12 @@ const Database::DatabaseEntry PadSingle = {
{ "default", { 32, 8, 4, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 32, 8, 2, 1 } },
+ { "default", { 32, 8, 2, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 32, 8, 2, 1 } },
@@ -195,9 +207,15 @@ const Database::DatabaseEntry PadComplexSingle = {
{ "default", { 32, 8, 1, 2 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 32, 8, 4, 1 } },
+ { "default", { 32, 8, 4, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 32, 8, 1, 2 } },
+ { "default", { 32, 8, 1, 1 } },
}
},
}
diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp
index b80a1666..bbda5c65 100644
--- a/src/database/kernels/padtranspose.hpp
+++ b/src/database/kernels/padtranspose.hpp
@@ -30,6 +30,12 @@ const Database::DatabaseEntry PadtransposeHalf = {
{ "default", { 0, 8, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 0, 8, 8 } },
+ { "default", { 0, 8, 8 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 0, 8, 1 } },
@@ -112,6 +118,12 @@ const Database::DatabaseEntry PadtransposeSingle = {
{ "default", { 1, 32, 2 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 0, 8, 2 } },
+ { "default", { 0, 8, 2 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 1, 16, 2 } },
@@ -194,9 +206,15 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
{ "default", { 1, 16, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 0, 8, 4 } },
+ { "default", { 0, 8, 4 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 1, 16, 2 } },
+ { "default", { 1, 8, 2 } },
}
},
}
diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp
index 446b632c..b00a23dc 100644
--- a/src/database/kernels/transpose.hpp
+++ b/src/database/kernels/transpose.hpp
@@ -30,9 +30,15 @@ const Database::DatabaseEntry TransposeHalf = {
{ "default", { 8, 1, 0, 8 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 8, 0, 0, 4 } },
+ { "default", { 8, 0, 0, 4 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 16, 0, 1, 4 } },
+ { "default", { 8, 0, 1, 8 } },
}
},
}
@@ -113,6 +119,12 @@ const Database::DatabaseEntry TransposeSingle = {
{ "default", { 8, 1, 0, 4 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 8, 1, 1, 4 } },
+ { "default", { 8, 1, 1, 4 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 8, 0, 1, 4 } },
@@ -189,6 +201,12 @@ const Database::DatabaseEntry TransposeComplexSingle = {
{ "default", { 16, 1, 0, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 16, 1, 0, 1 } },
+ { "default", { 16, 1, 0, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 8, 1, 1, 2 } },
diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp
index 58cde9d3..5cb225d1 100644
--- a/src/database/kernels/xaxpy.hpp
+++ b/src/database/kernels/xaxpy.hpp
@@ -30,9 +30,15 @@ const Database::DatabaseEntry XaxpyHalf = {
{ "default", { 8, 64, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 8, 64, 1 } },
+ { "default", { 8, 64, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 8, 256, 4 } },
+ { "default", { 8, 64, 1 } },
}
},
}
@@ -113,9 +119,15 @@ const Database::DatabaseEntry XaxpySingle = {
{ "default", { 4, 1024, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 4, 128, 2 } },
+ { "default", { 4, 128, 2 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 4, 256, 1 } },
+ { "default", { 4, 64, 1 } },
}
},
}
@@ -195,6 +207,12 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
{ "default", { 1, 256, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 1, 64, 1 } },
+ { "default", { 1, 64, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 1, 128, 1 } },
diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp
index d234c558..986c32b2 100644
--- a/src/database/kernels/xdot.hpp
+++ b/src/database/kernels/xdot.hpp
@@ -30,9 +30,15 @@ const Database::DatabaseEntry XdotHalf = {
{ "default", { 128, 32 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 64, 64 } },
+ { "default", { 64, 64 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 128, 32 } },
+ { "default", { 128, 64 } },
}
},
}
@@ -95,6 +101,12 @@ const Database::DatabaseEntry XdotSingle = {
{ "default", { 256, 64 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 128, 64 } },
+ { "default", { 128, 64 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 128, 32 } },
@@ -159,6 +171,12 @@ const Database::DatabaseEntry XdotComplexSingle = {
{ "default", { 512, 64 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 64, 256 } },
+ { "default", { 64, 256 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 256, 32 } },
diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp
index 2270dd44..43854afb 100644
--- a/src/database/kernels/xgemm.hpp
+++ b/src/database/kernels/xgemm.hpp
@@ -112,9 +112,15 @@ const Database::DatabaseEntry XgemmSingle = {
{ "default", { 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 2 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 1 } },
+ { "default", { 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } },
+ { "default", { 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 2 } },
}
},
}
diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp
index 7a1cd983..acace63f 100644
--- a/src/database/kernels/xgemm_direct.hpp
+++ b/src/database/kernels/xgemm_direct.hpp
@@ -77,9 +77,15 @@ const Database::DatabaseEntry XgemmDirectSingle = {
{ "default", { 2, 8, 8, 16, 16, 1, 1, 4, 2, 32 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 2, 8, 8, 8, 8, 1, 1, 2, 1, 16 } },
+ { "default", { 2, 8, 8, 8, 8, 1, 1, 2, 1, 16 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 2, 8, 8, 8, 8, 1, 1, 4, 2, 32 } },
+ { "default", { 2, 8, 8, 8, 8, 1, 1, 1, 2, 16 } },
}
},
}
diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp
index 7adb6f10..c537294a 100644
--- a/src/database/kernels/xgemv.hpp
+++ b/src/database/kernels/xgemv.hpp
@@ -106,6 +106,12 @@ const Database::DatabaseEntry XgemvSingle = {
{ "default", { 256, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 64, 1 } },
+ { "default", { 64, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 128, 1 } },
diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp
index 8c42aa0e..c3b9103a 100644
--- a/src/database/kernels/xgemv_fast.hpp
+++ b/src/database/kernels/xgemv_fast.hpp
@@ -106,6 +106,12 @@ const Database::DatabaseEntry XgemvFastSingle = {
{ "default", { 1, 256, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 1, 64, 4 } },
+ { "default", { 1, 64, 4 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 1, 64, 1 } },
diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp
index 644498e2..7e5905e4 100644
--- a/src/database/kernels/xgemv_fast_rot.hpp
+++ b/src/database/kernels/xgemv_fast_rot.hpp
@@ -82,6 +82,12 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
{ "default", { 8, 32, 32 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 4, 64, 16 } },
+ { "default", { 4, 64, 16 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 8, 32, 32 } },
diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp
index d294ab43..e17396f6 100644
--- a/src/database/kernels/xger.hpp
+++ b/src/database/kernels/xger.hpp
@@ -30,6 +30,12 @@ const Database::DatabaseEntry XgerHalf = {
{ "default", { 4, 8, 2 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 64, 4, 2 } },
+ { "default", { 64, 4, 2 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 64, 1, 2 } },
@@ -101,6 +107,12 @@ const Database::DatabaseEntry XgerSingle = {
{ "default", { 128, 1, 2 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 128, 1, 2 } },
+ { "default", { 128, 1, 2 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 32, 4, 2 } },
@@ -171,6 +183,12 @@ const Database::DatabaseEntry XgerComplexSingle = {
{ "default", { 128, 2, 2 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 64, 1, 4 } },
+ { "default", { 64, 1, 4 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 64, 2, 2 } },
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index db4c8ec4..9481881e 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -235,6 +235,15 @@ R"(
// =================================================================================================
+// Force inlining functions or not: some compilers don't support the inline keyword
+#ifdef USE_INLINE_KEYWORD
+ #define INLINE_FUNC inline
+#else
+ #define INLINE_FUNC
+#endif
+
+// =================================================================================================
+
// Shuffled workgroup indices to avoid partition camping, see below. For specific devices, this is
// enabled (see src/routine.cc).
#ifndef USE_STAGGERED_INDICES
@@ -245,18 +254,18 @@ R"(
// http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
// More details: https://github.com/CNugteren/CLBlast/issues/53
#if USE_STAGGERED_INDICES == 1
- inline size_t GetGroupIDFlat() {
+ INLINE_FUNC size_t GetGroupIDFlat() {
return get_group_id(0) + get_num_groups(0) * get_group_id(1);
}
- inline size_t GetGroupID1() {
+ INLINE_FUNC size_t GetGroupID1() {
return (GetGroupIDFlat()) % get_num_groups(1);
}
- inline size_t GetGroupID0() {
+ INLINE_FUNC size_t GetGroupID0() {
return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0);
}
#else
- inline size_t GetGroupID1() { return get_group_id(1); }
- inline size_t GetGroupID0() { return get_group_id(0); }
+ INLINE_FUNC size_t GetGroupID1() { return get_group_id(1); }
+ INLINE_FUNC size_t GetGroupID0() { return get_group_id(0); }
#endif
// =================================================================================================
diff --git a/src/kernels/level1/level1.opencl b/src/kernels/level1/level1.opencl
index 7e10426b..3c60c54a 100644
--- a/src/kernels/level1/level1.opencl
+++ b/src/kernels/level1/level1.opencl
@@ -47,7 +47,7 @@ R"(
// =================================================================================================
// The vectorized multiply function
-inline realV MultiplyVector(realV cvec, const real aval, const realV bvec) {
+INLINE_FUNC realV MultiplyVector(realV cvec, const real aval, const realV bvec) {
#if VW == 1
Multiply(cvec, aval, bvec);
#elif VW == 2
@@ -89,7 +89,7 @@ inline realV MultiplyVector(realV cvec, const real aval, const realV bvec) {
}
// The vectorized multiply-add function
-inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) {
+INLINE_FUNC realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) {
#if VW == 1
MultiplyAdd(cvec, aval, bvec);
#elif VW == 2
diff --git a/src/kernels/level2/level2.opencl b/src/kernels/level2/level2.opencl
index be979766..505231ca 100644
--- a/src/kernels/level2/level2.opencl
+++ b/src/kernels/level2/level2.opencl
@@ -33,9 +33,9 @@ R"(
// =================================================================================================
// Returns an element from a vector
-inline real LoadVector(const int id, const int max,
- __global real* gm, const int offset, const int inc,
- const int do_conjugate) {
+INLINE_FUNC real LoadVector(const int id, const int max,
+ __global real* gm, const int offset, const int inc,
+ const int do_conjugate) {
if (id < max) {
real result = gm[id*inc + offset];
if (do_conjugate) {
@@ -53,10 +53,10 @@ inline real LoadVector(const int id, const int max,
}
// Performs the rank-1 matrix update
-inline void MatrixUpdate(const int id1, const int id2, const int max1, const int max2,
- __global real* agm, const int a_offset, const int a_ld,
- const real alpha, const real xvalue, const real yvalue,
- const int is_upper) {
+INLINE_FUNC void MatrixUpdate(const int id1, const int id2, const int max1, const int max2,
+ __global real* agm, const int a_offset, const int a_ld,
+ const real alpha, const real xvalue, const real yvalue,
+ const int is_upper) {
// Bounds of a regular matrix
if (id1 < max1 && id2 < max2) {
@@ -100,11 +100,11 @@ inline void MatrixUpdate(const int id1, const int id2, const int max1, const int
}
// Performs the rank-2 matrix update
-inline void MatrixUpdate2(const int id1, const int id2, const int max1, const int max2,
- __global real* agm, const int a_offset, const int a_ld,
- const real alpha1, const real xvalue, const real yvalue,
- const real alpha2, const real xtvalue, const real ytvalue,
- const int is_upper) {
+INLINE_FUNC void MatrixUpdate2(const int id1, const int id2, const int max1, const int max2,
+ __global real* agm, const int a_offset, const int a_ld,
+ const real alpha1, const real xvalue, const real yvalue,
+ const real alpha2, const real xtvalue, const real ytvalue,
+ const int is_upper) {
// Bounds of a regular matrix
if (id1 < max1 && id2 < max2) {
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index ff011acd..ea0478f0 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -36,9 +36,9 @@ R"(
// =================================================================================================
// Defines how to load the input matrix in the non-vectorized case
-inline real LoadMatrixA(const __global real* restrict agm, const int x, const int y,
- const int a_ld, const int a_offset, const int parameter,
- const int kl, const int ku) {
+INLINE_FUNC real LoadMatrixA(const __global real* restrict agm, const int x, const int y,
+ const int a_ld, const int a_offset, const int parameter,
+ const int kl, const int ku) {
real result;
// For banded matrices
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 02a1f956..8a08f076 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -75,8 +75,8 @@ R"(
// =================================================================================================
// Loads a vector input value
-inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
- const int a_ld) {
+INLINE_FUNC realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
+ const int a_ld) {
return agm[a_ld*y + x];
}
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index 93b89187..6eeadbd1 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -24,14 +24,14 @@ R"(
// Copies a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
// value and offset can be different.
-inline void _CopyPadMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const real alpha,
- const int do_conjugate) {
+INLINE_FUNC void _CopyPadMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real alpha,
+ const int do_conjugate) {
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -79,15 +79,15 @@ void CopyPadMatrix(const int src_one, const int src_two,
// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
// writes only the actual data back to the destination matrix. Again, the ld value and offset can
// be different.
-inline void _CopyMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const real alpha,
- const int upper, const int lower,
- const int diagonal_imag_zero) {
+INLINE_FUNC void _CopyMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real alpha,
+ const int upper, const int lower,
+ const int diagonal_imag_zero) {
// Loops over the work per thread in both dimensions
#pragma unroll
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index 874c1510..93241700 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -164,10 +164,10 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
// =================================================================================================
// Triple matrix-multiplication kernel: C = A * B
-inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n,
- __global const real* agm, __global const real* bgm, __global real* cgm,
- const int lda, const int ldb, const int ldc,
- int current_size, int num_pages, const int block_size) {
+INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n,
+ __global const real* agm, __global const real* bgm, __global real* cgm,
+ const int lda, const int ldb, const int ldc,
+ int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int by = get_group_id(1) / num_pages;
@@ -250,9 +250,9 @@ inline void TripleMatMul(const int size, const bool upper, const int part, __loc
// =================================================================================================
// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower)
-inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n,
- __global const real* src, const int a_offset, const int lda,
- __global real* dest, int current_size, int num_pages, const int block_size) {
+INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n,
+ __global const real* src, const int a_offset, const int lda,
+ __global real* dest, int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
@@ -286,8 +286,8 @@ inline void TripleMatMulPart1(const int size, const bool upper, __local real* bl
}
// Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
-inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n,
- __global real* dest, int current_size, int num_pages, const int block_size) {
+INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n,
+ __global real* dest, int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index fb60ce75..49c5b9a3 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -24,15 +24,15 @@ R"(
// Transposes a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the transposed source matrix dimensions.
-inline void _TransposePadMatrix(__local real* tile,
- const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const real alpha,
- const int do_conjugate) {
+INLINE_FUNC void _TransposePadMatrix(__local real* tile,
+ const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real alpha,
+ const int do_conjugate) {
// Loop over the work per thread
#pragma unroll
@@ -105,16 +105,16 @@ void TransposePadMatrix(const int src_one, const int src_two,
// Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
// padded source matrix, but only the actual data is written back to the transposed destination
// matrix. This kernel optionally checks for upper/lower triangular matrices.
-inline void _TransposeMatrix(__local real* tile,
- const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const real alpha,
- const int upper, const int lower,
- const int diagonal_imag_zero) {
+INLINE_FUNC void _TransposeMatrix(__local real* tile,
+ const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real alpha,
+ const int upper, const int lower,
+ const int diagonal_imag_zero) {
// Loop over the work per thread
#pragma unroll
diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl
index a8bd450e..8b650589 100644
--- a/src/kernels/level3/xgemm_direct_part1.opencl
+++ b/src/kernels/level3/xgemm_direct_part1.opencl
@@ -93,7 +93,7 @@ R"(
// =================================================================================================
// Initializes the accumulation registers to zero
-inline void InitAccRegistersDirect(real cpm[NWID][MWID]) {
+INLINE_FUNC void InitAccRegistersDirect(real cpm[NWID][MWID]) {
#pragma unroll
for (int mi=0; mi<MWID; ++mi) {
#pragma unroll
@@ -106,7 +106,7 @@ inline void InitAccRegistersDirect(real cpm[NWID][MWID]) {
// =================================================================================================
// Performs the actual computation: Cpm += Apm * Bpm
-inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) {
+INLINE_FUNC void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
#pragma unroll
@@ -120,9 +120,9 @@ inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real
// Loads global off-chip memory into thread-private register files. This function is specific for
// loading the A input matrix.
-inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID],
- const int a_ld, const int a_offset, const int idm, const int idk,
- const int a_transpose, const int a_conjugate) {
+INLINE_FUNC void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID],
+ const int a_ld, const int a_offset, const int idm, const int idk,
+ const int a_transpose, const int a_conjugate) {
#pragma unroll
for (int mi=0; mi<MWID; ++mi) {
const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
@@ -132,9 +132,9 @@ inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[
}
// Same as above, but now for the B input matrix
-inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID],
- const int b_ld, const int b_offset, const int idn, const int idk,
- const int b_transpose, const int b_conjugate) {
+INLINE_FUNC void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID],
+ const int b_ld, const int b_offset, const int idn, const int idk,
+ const int b_transpose, const int b_conjugate) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
@@ -145,10 +145,10 @@ inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[
// Loads global off-chip memory into thread-private register files. This function is specific for
// loading the A input matrix. This is the same as above but now includes a bounds check.
-inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID],
- const int a_ld, const int a_offset, const int idm, const int idk,
- const int a_transpose, const int a_conjugate,
- const int kSizeM) {
+INLINE_FUNC void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID],
+ const int a_ld, const int a_offset, const int idm, const int idk,
+ const int a_transpose, const int a_conjugate,
+ const int kSizeM) {
#pragma unroll
for (int mi=0; mi<MWID; ++mi) {
if (idm + mi < kSizeM) {
@@ -163,10 +163,10 @@ inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm
}
// Same as above, but now for the B input matrix
-inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID],
- const int b_ld, const int b_offset, const int idn, const int idk,
- const int b_transpose, const int b_conjugate,
- const int kSizeN) {
+INLINE_FUNC void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID],
+ const int b_ld, const int b_offset, const int idn, const int idk,
+ const int b_transpose, const int b_conjugate,
+ const int kSizeN) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
if (idn + ni < kSizeN) {
@@ -184,8 +184,8 @@ inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm
// Caches on-chip local memory into per-thread private memory (registers). This function is specific
// for caching the A input matrix.
-inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg,
- const int a_transpose) {
+INLINE_FUNC void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg,
+ const int a_transpose) {
#pragma unroll
for (int mi=0; mi<MWID; ++mi) {
const int mg = mi + get_local_id(0)*MWID;
@@ -195,8 +195,8 @@ inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int k
}
// Same as above, but now for the B input matrix
-inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg,
- const int b_transpose) {
+INLINE_FUNC void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg,
+ const int b_transpose) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
const int ng = ni + get_local_id(1)*NWID;
@@ -209,10 +209,10 @@ inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int k
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
-inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID],
- const int idm, const int idn,
- const real alpha, const real beta,
- const int c_ld, const int c_offset, const int c_transpose) {
+INLINE_FUNC void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID],
+ const int idm, const int idn,
+ const real alpha, const real beta,
+ const int c_ld, const int c_offset, const int c_transpose) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
#pragma unroll
@@ -237,10 +237,10 @@ inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID],
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
-inline void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID],
- const int idm, const int idn, const int kSizeM, const int kSizeN,
- const real alpha, const real beta,
- const int c_ld, const int c_offset, const int c_transpose) {
+INLINE_FUNC void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID],
+ const int idm, const int idn, const int kSizeM, const int kSizeN,
+ const real alpha, const real beta,
+ const int c_ld, const int c_offset, const int c_transpose) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
#pragma unroll
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl
index 3af14bff..1d9330fc 100644
--- a/src/kernels/level3/xgemm_direct_part2.opencl
+++ b/src/kernels/level3/xgemm_direct_part2.opencl
@@ -19,9 +19,9 @@ R"(
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix.
-inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
- const int a_ld, const int a_offset, const int kwg,
- const int a_transpose, const int a_conjugate) {
+INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
+ const int a_ld, const int a_offset, const int kwg,
+ const int a_transpose, const int a_conjugate) {
#if MDIMCD == MDIMAD
const int la0 = get_local_id(0);
const int la1 = get_local_id(1);
@@ -90,9 +90,9 @@ inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local re
}
// Same as above, but now for the B input matrix
-inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
- const int b_ld, const int b_offset, const int kwg,
- const int b_transpose, const int b_conjugate) {
+INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
+ const int b_ld, const int b_offset, const int kwg,
+ const int b_transpose, const int b_conjugate) {
#if MDIMCD == NDIMBD
const int lb0 = get_local_id(0);
const int lb1 = get_local_id(1);
@@ -165,9 +165,9 @@ inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local re
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix. In contrast to the functions above, this function performs doesn't
// use the vector data-types.
-inline void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
- const int a_ld, const int a_offset, const int kwg,
- const int a_transpose, const int a_conjugate) {
+INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
+ const int a_ld, const int a_offset, const int kwg,
+ const int a_transpose, const int a_conjugate) {
#if MDIMCD == MDIMAD
const int la0 = get_local_id(0);
const int la1 = get_local_id(1);
@@ -196,9 +196,9 @@ inline void GlobalToLocalScalarA(const __global real* restrict agms, __local rea
}
// Same as above, but now for the B input matrix
-inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
- const int b_ld, const int b_offset, const int kwg,
- const int b_transpose, const int b_conjugate) {
+INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
+ const int b_ld, const int b_offset, const int kwg,
+ const int b_transpose, const int b_conjugate) {
#if MDIMCD == NDIMBD
const int lb0 = get_local_id(0);
const int lb1 = get_local_id(1);
@@ -231,10 +231,10 @@ inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local rea
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix. In contrast to the functions above, this function performs bounds
// checks and doesn't use the vector data-types.
-inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
- const int a_ld, const int a_offset, const int kwg,
- const int a_transpose, const int a_conjugate,
- const int kSizeM, const int kSizeK) {
+INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
+ const int a_ld, const int a_offset, const int kwg,
+ const int a_transpose, const int a_conjugate,
+ const int kSizeM, const int kSizeK) {
#if MDIMCD == MDIMAD
const int la0 = get_local_id(0);
const int la1 = get_local_id(1);
@@ -270,10 +270,10 @@ inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local re
}
// Same as above, but now for the B input matrix
-inline void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
- const int b_ld, const int b_offset, const int kwg,
- const int b_transpose, const int b_conjugate,
- const int kSizeN, const int kSizeK) {
+INLINE_FUNC void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
+ const int b_ld, const int b_offset, const int kwg,
+ const int b_transpose, const int b_conjugate,
+ const int kSizeN, const int kSizeK) {
#if MDIMCD == NDIMBD
const int lb0 = get_local_id(0);
const int lb1 = get_local_id(1);
diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl
index c04cdeb8..b0beb614 100644
--- a/src/kernels/level3/xgemm_direct_part3.opencl
+++ b/src/kernels/level3/xgemm_direct_part3.opencl
@@ -18,15 +18,15 @@ R"(
// =================================================================================================
// Main body of the kernel. This is the direct version without pre/post processing and restrictions.
-inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
- const real_arg arg_alpha,
- const real_arg arg_beta,
- const __global realMD* restrict agm, const int a_offset, const int a_ld,
- const __global realND* restrict bgm, const int b_offset, const int b_ld,
- __global real* cgm, const int c_offset, const int c_ld,
- __local real* alm, __local real* blm,
- const int a_transpose, const int b_transpose, const int c_transpose,
- const int a_conjugate, const int b_conjugate) {
+INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realMD* restrict agm, const int a_offset, const int a_ld,
+ const __global realND* restrict bgm, const int b_offset, const int b_ld,
+ __global real* cgm, const int c_offset, const int c_ld,
+ __local real* alm, __local real* blm,
+ const int a_transpose, const int b_transpose, const int c_transpose,
+ const int a_conjugate, const int b_conjugate) {
const real alpha = GetRealArg(arg_alpha);
const real beta = GetRealArg(arg_beta);
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index d0ce06ad..07dafe13 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -135,7 +135,7 @@ R"(
// =================================================================================================
// Initializes the accumulation registers to zero
-inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
+INLINE_FUNC void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#pragma unroll
@@ -186,8 +186,8 @@ inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix.
#if SA == 1
-inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
- const int kSizeM, const int tid, const int kwg) {
+INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
+ const int kSizeM, const int tid, const int kwg) {
const int la0 = tid % MDIMA;
const int la1 = tid / MDIMA;
#pragma unroll
@@ -216,8 +216,8 @@ inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* al
// Same as above, but now for the B input matrix
#if SB == 1
-inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
- const int kSizeN, const int tid, const int kwg) {
+INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
+ const int kSizeN, const int tid, const int kwg) {
const int lb0 = tid % NDIMB;
const int lb1 = tid / NDIMB;
#pragma unroll
@@ -249,8 +249,8 @@ inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* bl
// Caches global off-chip memory directly into per-thread private memory (registers). This function
// is specific for caching the A input matrix.
#if SA == 0
-inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
- const int kSizeM, const int idk, const int kwg) {
+INLINE_FUNC void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
+ const int kSizeM, const int idk, const int kwg) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
@@ -272,8 +272,8 @@ inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/V
// Same as above, but now for the B input matrix
#if SB == 0
-inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
- const int kSizeN, const int idk) {
+INLINE_FUNC void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
+ const int kSizeN, const int idk) {
#pragma unroll
for (int ni=0; ni<NWI/VWN; ++ni) {
@@ -298,7 +298,7 @@ inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/V
// Caches on-chip local memory into per-thread private memory (registers). This function is specific
// for caching the A input matrix.
#if SA == 1
-inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
+INLINE_FUNC void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#if STRM == 0
@@ -313,7 +313,7 @@ inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg
// Same as above, but now for the B input matrix
#if SB == 1
-inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
+INLINE_FUNC void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
#pragma unroll
for (int ni=0; ni<NWI/VWN; ++ni) {
#if STRN == 0
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index e8234a29..06fafc8f 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -18,7 +18,7 @@ R"(
// =================================================================================================
// The vectorised multiply-add function
-inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
+INLINE_FUNC realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
#if USE_VECTOR_MAD == 1
cvec += avec * bval;
#else
@@ -64,7 +64,7 @@ inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
}
// Performs the actual computation: Cpm += Apm * Bpm
-inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], realN bpm[NWI/VWN]) {
+INLINE_FUNC void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], realN bpm[NWI/VWN]) {
#pragma unroll
for (int ni=0; ni<NWI/VWN; ++ni) {
#pragma unroll
@@ -115,8 +115,8 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
-inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
- const real alpha, const real beta) {
+INLINE_FUNC void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
+ const real alpha, const real beta) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#pragma unroll
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index 8ac3a3a8..3f0d590d 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -18,17 +18,17 @@ R"(
// =================================================================================================
// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
-inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
- const __global realM* restrict agm, const __global realN* restrict bgm,
- __global realM* cgm, realM cpm[NWI][MWI/VWM]
- #if SA == 1 && SB == 1
- , __local realM* alm, __local realN* blm
- #elif SA == 1
- , __local realM* alm
- #elif SB == 1
- , __local realN* blm
- #endif
- ) {
+INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
+ const __global realM* restrict agm, const __global realN* restrict bgm,
+ __global realM* cgm, realM cpm[NWI][MWI/VWM]
+ #if SA == 1 && SB == 1
+ , __local realM* alm, __local realN* blm
+ #elif SA == 1
+ , __local realM* alm
+ #elif SB == 1
+ , __local realN* blm
+ #endif
+ ) {
// Allocates workitem-private memory (registers)
realM apm[MWI/VWM];
diff --git a/src/routine.cpp b/src/routine.cpp
index 81baa590..7d4ed76f 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -135,7 +135,13 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
// Adds the name of the routine as a define
source_string += "#define ROUTINE_"+routine_name_+"\n";
- // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve
+ // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
+ // which it is known to work with all OpenCL platforms.
+ if (device_.IsNVIDIA() || device_.IsARM()) {
+ source_string += "#define USE_INLINE_KEYWORD 1\n";
+ }
+
+ // For specific devices, use the non-IEEE 754 compliant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if (device_.IsAMD() && device_.IsGPU()) {
source_string += "#define USE_CL_MAD 1\n";
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
index c995dc12..5b178e53 100644
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@@ -73,4 +73,79 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
}
// =================================================================================================
+
+// Sets all elements of a matrix to a constant value
+template <typename T>
+void FillMatrix(Queue &queue, const Device &device,
+ const Program &program, const Databases &,
+ EventPointer event, const std::vector<Event> &waitForEvents,
+ const size_t m, const size_t n, const size_t ld, const size_t offset,
+ const Buffer<T> &dest,
+ const T constant_value) {
+ auto kernel = Kernel(program, "FillMatrix");
+ kernel.SetArgument(0, static_cast<int>(m));
+ kernel.SetArgument(1, static_cast<int>(n));
+ kernel.SetArgument(2, static_cast<int>(ld));
+ kernel.SetArgument(3, static_cast<int>(offset));
+ kernel.SetArgument(4, dest());
+ kernel.SetArgument(5, GetRealArg(constant_value));
+ auto local = std::vector<size_t>{8, 8};
+ auto global = std::vector<size_t>{Ceil(m, 8), Ceil(n, 8)};
+ RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+}
+
+// Compiles the above function
+template void FillMatrix<half>(Queue&, const Device&, const Program&, const Databases&,
+ EventPointer, const std::vector<Event>&, const size_t, const size_t,
+ const size_t, const size_t, const Buffer<half>&, const half);
+template void FillMatrix<float>(Queue&, const Device&, const Program&, const Databases&,
+ EventPointer, const std::vector<Event>&, const size_t, const size_t,
+ const size_t, const size_t, const Buffer<float>&, const float);
+template void FillMatrix<double>(Queue&, const Device&, const Program&, const Databases&,
+ EventPointer, const std::vector<Event>&, const size_t, const size_t,
+ const size_t, const size_t, const Buffer<double>&, const double);
+template void FillMatrix<float2>(Queue&, const Device&, const Program&, const Databases&,
+ EventPointer, const std::vector<Event>&, const size_t, const size_t,
+ const size_t, const size_t, const Buffer<float2>&, const float2);
+template void FillMatrix<double2>(Queue&, const Device&, const Program&, const Databases&,
+ EventPointer, const std::vector<Event>&, const size_t, const size_t,
+ const size_t, const size_t, const Buffer<double2>&, const double2);
+
+// Sets all elements of a vector to a constant value
+template <typename T>
+void FillVector(Queue &queue, const Device &device,
+ const Program &program, const Databases &,
+ EventPointer event, const std::vector<Event> &waitForEvents,
+ const size_t n, const size_t inc, const size_t offset,
+ const Buffer<T> &dest,
+ const T constant_value) {
+ auto kernel = Kernel(program, "FillVector");
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, static_cast<int>(inc));
+ kernel.SetArgument(2, static_cast<int>(offset));
+ kernel.SetArgument(3, dest());
+ kernel.SetArgument(4, GetRealArg(constant_value));
+ auto local = std::vector<size_t>{64};
+ auto global = std::vector<size_t>{Ceil(n, 64)};
+ RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+}
+
+// Compiles the above function
+template void FillVector<half>(Queue&, const Device&, const Program&, const Databases&,
+ EventPointer, const std::vector<Event>&, const size_t, const size_t,
+ const size_t, const Buffer<half>&, const half);
+template void FillVector<float>(Queue&, const Device&, const Program&, const Databases&,
+ EventPointer, const std::vector<Event>&, const size_t, const size_t,
+ const size_t, const Buffer<float>&, const float);
+template void FillVector<double>(Queue&, const Device&, const Program&, const Databases&,
+ EventPointer, const std::vector<Event>&, const size_t, const size_t,
+ const size_t, const Buffer<double>&, const double);
+template void FillVector<float2>(Queue&, const Device&, const Program&, const Databases&,
+ EventPointer, const std::vector<Event>&, const size_t, const size_t,
+ const size_t, const Buffer<float2>&, const float2);
+template void FillVector<double2>(Queue&, const Device&, const Program&, const Databases&,
+ EventPointer, const std::vector<Event>&, const size_t, const size_t,
+ const size_t, const Buffer<double2>&, const double2);
+
+// =================================================================================================
} // namespace clblast
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 28a43da5..84ccd9d2 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -40,18 +40,7 @@ void FillMatrix(Queue &queue, const Device &device,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
const Buffer<T> &dest,
- const T constant_value) {
- auto kernel = Kernel(program, "FillMatrix");
- kernel.SetArgument(0, static_cast<int>(m));
- kernel.SetArgument(1, static_cast<int>(n));
- kernel.SetArgument(2, static_cast<int>(ld));
- kernel.SetArgument(3, static_cast<int>(offset));
- kernel.SetArgument(4, dest());
- kernel.SetArgument(5, GetRealArg(constant_value));
- auto local = std::vector<size_t>{8, 8};
- auto global = std::vector<size_t>{Ceil(m, 8), Ceil(n, 8)};
- RunKernel(kernel, queue, device, global, local, event, waitForEvents);
-}
+ const T constant_value);
// Sets all elements of a vector to a constant value
template <typename T>
@@ -60,17 +49,7 @@ void FillVector(Queue &queue, const Device &device,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
const Buffer<T> &dest,
- const T constant_value) {
- auto kernel = Kernel(program, "FillVector");
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, static_cast<int>(inc));
- kernel.SetArgument(2, static_cast<int>(offset));
- kernel.SetArgument(3, dest());
- kernel.SetArgument(4, GetRealArg(constant_value));
- auto local = std::vector<size_t>{64};
- auto global = std::vector<size_t>{Ceil(n, 64)};
- RunKernel(kernel, queue, device, global, local, event, waitForEvents);
-}
+ const T constant_value);
// =================================================================================================
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 30e5999c..136eec43 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -283,8 +283,10 @@ void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
const auto m_ceiled = Ceil(m, db_["WGD"]);
const auto n_ceiled = Ceil(n, db_["WGD"]);
const auto global = std::vector<size_t>{
- (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
- (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
+ // CeilDiv(m * db_["MDIMCD"], db_["WGD"]),
+ // CeilDiv(n * db_["NDIMCD"], db_["WGD"])
+ (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
+ (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
};
const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp
index 0fea1922..ee8448d2 100644
--- a/src/routines/levelx/xgemmbatched.cpp
+++ b/src/routines/levelx/xgemmbatched.cpp
@@ -94,8 +94,8 @@ void XgemmBatched<T>::DoGemmBatched(const Layout layout, const Transpose a_trans
// Tests the matrices for validity
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
- TestMatrixA(a_one, a_two, a_buffer, a_offsets[batch], a_ld);
- TestMatrixB(b_one, b_two, b_buffer, b_offsets[batch], b_ld);
+ TestMatrixA(a_one, a_two, a_buffer, a_offsets[batch], a_ld, false); // don't test for invalid LD
+ TestMatrixB(b_one, b_two, b_buffer, b_offsets[batch], b_ld, false); // don't test for invalid LD
TestMatrixC(c_one, c_two, c_buffer, c_offsets[batch], c_ld);
}
diff --git a/src/utilities/buffer_test.hpp b/src/utilities/buffer_test.hpp
index 652ab8c6..b5693181 100644
--- a/src/utilities/buffer_test.hpp
+++ b/src/utilities/buffer_test.hpp
@@ -23,8 +23,8 @@ namespace clblast {
// Tests matrix 'A' for validity
template <typename T>
void TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
- const size_t offset, const size_t ld) {
- if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimA); }
+ const size_t offset, const size_t ld, const bool test_lead_dim = true) {
+ if (test_lead_dim && ld < one) { throw BLASError(StatusCode::kInvalidLeadDimA); }
try {
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryA); }
@@ -34,8 +34,8 @@ void TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
// Tests matrix 'B' for validity
template <typename T>
void TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
- const size_t offset, const size_t ld) {
- if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimB); }
+ const size_t offset, const size_t ld, const bool test_lead_dim = true) {
+ if (test_lead_dim && ld < one) { throw BLASError(StatusCode::kInvalidLeadDimB); }
try {
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryB); }
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index 95b70cd5..0cd00438 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -7,7 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
-// This file implements the common (test) utility functions.
+// This file implements the common utility functions.
//
// =================================================================================================
@@ -85,14 +85,6 @@ template <> double AbsoluteValue(const double2 value) {
return std::sqrt(value.real() * value.real() + value.imag() * value.imag());
}
-// Returns whether a scalar is close to zero
-template <typename T> bool IsCloseToZero(const T value) { return (value > -SmallConstant<T>()) && (value < SmallConstant<T>()); }
-template bool IsCloseToZero<float>(const float);
-template bool IsCloseToZero<double>(const double);
-template <> bool IsCloseToZero(const half value) { return IsCloseToZero(HalfToFloat(value)); }
-template <> bool IsCloseToZero(const float2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); }
-template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); }
-
// =================================================================================================
// Implements the string conversion using std::to_string if possible
@@ -319,12 +311,6 @@ bool CheckArgument(const std::vector<std::string> &arguments, std::string &help,
// =================================================================================================
-// Returns a random seed. This used to be implemented using 'std::random_device', but that doesn't
-// always work. The chrono-timers are more reliable in that sense, but perhaps less random.
-unsigned int GetRandomSeed() {
- return static_cast<unsigned int>(std::chrono::system_clock::now().time_since_epoch().count());
-}
-
// Create a random number generator and populates a vector with samples from a random distribution
template <typename T>
void PopulateVector(std::vector<T> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) {
@@ -354,87 +340,6 @@ void PopulateVector(std::vector<half> &vector, std::mt19937 &mt, std::uniform_re
// =================================================================================================
-template <typename T, typename U>
-void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
- Queue &queue, const std::vector<std::string> &names) {
- for (auto &name: names) {
- if (name == kBufVecX) {buffers_host.x_vec = std::vector<T>(args.x_size, static_cast<T>(0)); buffers.x_vec.Read(queue, args.x_size, buffers_host.x_vec); }
- else if (name == kBufVecY) { buffers_host.y_vec = std::vector<T>(args.y_size, static_cast<T>(0)); buffers.y_vec.Read(queue, args.y_size, buffers_host.y_vec); }
- else if (name == kBufMatA) { buffers_host.a_mat = std::vector<T>(args.a_size, static_cast<T>(0)); buffers.a_mat.Read(queue, args.a_size, buffers_host.a_mat); }
- else if (name == kBufMatB) { buffers_host.b_mat = std::vector<T>(args.b_size, static_cast<T>(0)); buffers.b_mat.Read(queue, args.b_size, buffers_host.b_mat); }
- else if (name == kBufMatC) { buffers_host.c_mat = std::vector<T>(args.c_size, static_cast<T>(0)); buffers.c_mat.Read(queue, args.c_size, buffers_host.c_mat); }
- else if (name == kBufMatAP) { buffers_host.ap_mat = std::vector<T>(args.ap_size, static_cast<T>(0)); buffers.ap_mat.Read(queue, args.ap_size, buffers_host.ap_mat); }
- else if (name == kBufScalar) { buffers_host.scalar = std::vector<T>(args.scalar_size, static_cast<T>(0)); buffers.scalar.Read(queue, args.scalar_size, buffers_host.scalar); }
- else { throw std::runtime_error("Invalid buffer name"); }
- }
-}
-
-template <typename T, typename U>
-void HostToDevice(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
- Queue &queue, const std::vector<std::string> &names) {
- for (auto &name: names) {
- if (name == kBufVecX) { buffers.x_vec.Write(queue, args.x_size, buffers_host.x_vec); }
- else if (name == kBufVecY) { buffers.y_vec.Write(queue, args.y_size, buffers_host.y_vec); }
- else if (name == kBufMatA) { buffers.a_mat.Write(queue, args.a_size, buffers_host.a_mat); }
- else if (name == kBufMatB) { buffers.b_mat.Write(queue, args.b_size, buffers_host.b_mat); }
- else if (name == kBufMatC) { buffers.c_mat.Write(queue, args.c_size, buffers_host.c_mat); }
- else if (name == kBufMatAP) { buffers.ap_mat.Write(queue, args.ap_size, buffers_host.ap_mat); }
- else if (name == kBufScalar) { buffers.scalar.Write(queue, args.scalar_size, buffers_host.scalar); }
- else { throw std::runtime_error("Invalid buffer name"); }
- }
-}
-
-// Compiles the above functions
-template void DeviceToHost(const Arguments<half>&, Buffers<half>&, BuffersHost<half>&, Queue&, const std::vector<std::string>&);
-template void DeviceToHost(const Arguments<float>&, Buffers<float>&, BuffersHost<float>&, Queue&, const std::vector<std::string>&);
-template void DeviceToHost(const Arguments<double>&, Buffers<double>&, BuffersHost<double>&, Queue&, const std::vector<std::string>&);
-template void DeviceToHost(const Arguments<float>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&);
-template void DeviceToHost(const Arguments<double>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&);
-template void DeviceToHost(const Arguments<float2>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&);
-template void DeviceToHost(const Arguments<double2>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&);
-template void HostToDevice(const Arguments<half>&, Buffers<half>&, BuffersHost<half>&, Queue&, const std::vector<std::string>&);
-template void HostToDevice(const Arguments<float>&, Buffers<float>&, BuffersHost<float>&, Queue&, const std::vector<std::string>&);
-template void HostToDevice(const Arguments<double>&, Buffers<double>&, BuffersHost<double>&, Queue&, const std::vector<std::string>&);
-template void HostToDevice(const Arguments<float>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&);
-template void HostToDevice(const Arguments<double>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&);
-template void HostToDevice(const Arguments<float2>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&);
-template void HostToDevice(const Arguments<double2>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&);
-
-// =================================================================================================
-
-// Conversion between half and single-precision
-std::vector<float> HalfToFloatBuffer(const std::vector<half>& source) {
- auto result = std::vector<float>(source.size());
- for (auto i = size_t(0); i < source.size(); ++i) { result[i] = HalfToFloat(source[i]); }
- return result;
-}
-void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source) {
- for (auto i = size_t(0); i < source.size(); ++i) { result[i] = FloatToHalf(source[i]); }
-}
-
-// As above, but now for OpenCL data-types instead of std::vectors
-Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw) {
- const auto size = source.GetSize() / sizeof(half);
- auto queue = Queue(queue_raw);
- auto context = queue.GetContext();
- auto source_cpu = std::vector<half>(size);
- source.Read(queue, size, source_cpu);
- auto result_cpu = HalfToFloatBuffer(source_cpu);
- auto result = Buffer<float>(context, size);
- result.Write(queue, size, result_cpu);
- return result;
-}
-void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw) {
- const auto size = source.GetSize() / sizeof(float);
- auto queue = Queue(queue_raw);
- auto context = queue.GetContext();
- auto source_cpu = std::vector<float>(size);
- source.Read(queue, size, source_cpu);
- auto result_cpu = std::vector<half>(size);
- FloatToHalfBuffer(result_cpu, source_cpu);
- result.Write(queue, size, result_cpu);
-}
-
// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is
// no conversion, but half-precision is not supported as kernel argument so it is converted to float.
template <> typename RealArg<half>::Type GetRealArg(const half value) { return HalfToFloat(value); }
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 8daeda08..784e0324 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -7,10 +7,9 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
-// This file provides declarations for the common (test) utility functions such as a command-line
+// This file provides declarations for the common utility functions such as a command-line
// argument parser. On top of this, it serves as the 'common' header, including the C++ OpenCL
-// wrapper. These utilities are not only used for CLBlast, but also included as part of the tuners,
-// the performance client and the correctness testers.
+// wrapper.
//
// =================================================================================================
@@ -94,19 +93,6 @@ constexpr auto kArgDilationW = "dilationw";
// The tuner-specific arguments in string form
constexpr auto kArgFraction = "fraction";
-// The client-specific arguments in string form
-constexpr auto kArgCompareclblas = "clblas";
-constexpr auto kArgComparecblas = "cblas";
-constexpr auto kArgComparecublas = "cublas";
-constexpr auto kArgStepSize = "step";
-constexpr auto kArgNumSteps = "num_steps";
-constexpr auto kArgNumRuns = "runs";
-constexpr auto kArgWarmUp = "warm_up";
-
-// The test-specific arguments in string form
-constexpr auto kArgFullTest = "full_test";
-constexpr auto kArgVerbose = "verbose";
-
// The common arguments in string form
constexpr auto kArgPlatform = "platform";
constexpr auto kArgDevice = "device";
@@ -114,6 +100,7 @@ constexpr auto kArgPrecision = "precision";
constexpr auto kArgHelp = "h";
constexpr auto kArgQuiet = "q";
constexpr auto kArgNoAbbreviations = "no_abbrv";
+constexpr auto kArgNumRuns = "runs";
// The buffer names
constexpr auto kBufVecX = "X";
@@ -146,9 +133,6 @@ template <typename T> T SmallConstant();
// Returns the absolute value of a scalar (modulus in case of complex numbers)
template <typename T> typename BaseType<T>::Type AbsoluteValue(const T value);
-// Returns whether a scalar is close to zero
-template <typename T> bool IsCloseToZero(const T value);
-
// =================================================================================================
// Structure containing all possible arguments for test clients, including their default values
@@ -233,28 +217,6 @@ struct Arguments {
bool no_abbrv = false;
};
-// Structure containing all possible buffers for test clients
-template <typename T>
-struct Buffers {
- Buffer<T> x_vec;
- Buffer<T> y_vec;
- Buffer<T> a_mat;
- Buffer<T> b_mat;
- Buffer<T> c_mat;
- Buffer<T> ap_mat;
- Buffer<T> scalar;
-};
-template <typename T>
-struct BuffersHost {
- std::vector<T> x_vec;
- std::vector<T> y_vec;
- std::vector<T> a_mat;
- std::vector<T> b_mat;
- std::vector<T> c_mat;
- std::vector<T> ap_mat;
- std::vector<T> scalar;
-};
-
// =================================================================================================
// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
@@ -289,9 +251,6 @@ bool CheckArgument(const std::vector<std::string> &arguments, std::string &help,
// =================================================================================================
-// Returns a random number to be used as a seed
-unsigned int GetRandomSeed();
-
// Test/example data lower and upper limit
constexpr auto kTestDataLowerLimit = -2.0;
constexpr auto kTestDataUpperLimit = 2.0;
@@ -302,26 +261,6 @@ void PopulateVector(std::vector<T> &vector, std::mt19937 &mt, std::uniform_real_
// =================================================================================================
-// Copies buffers from the OpenCL device to the host
-template <typename T, typename U>
-void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
- Queue &queue, const std::vector<std::string> &names);
-
-// Copies buffers from the host to the OpenCL device
-template <typename T, typename U>
-void HostToDevice(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
- Queue &queue, const std::vector<std::string> &names);
-
-// =================================================================================================
-
-// Conversion between half and single-precision
-std::vector<float> HalfToFloatBuffer(const std::vector<half>& source);
-void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source);
-
-// As above, but now for OpenCL data-types instead of std::vectors
-Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw);
-void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw);
-
// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is
// no conversion, but half-precision is not supported as kernel argument so it is converted to float.
template <typename T> struct RealArg { using Type = T; };
diff --git a/test/correctness/misc/override_parameters.cpp b/test/correctness/misc/override_parameters.cpp
index 4283c039..535d9286 100644
--- a/test/correctness/misc/override_parameters.cpp
+++ b/test/correctness/misc/override_parameters.cpp
@@ -15,6 +15,7 @@
#include <vector>
#include <unordered_map>
#include <random>
+#include <iostream>
#include "utilities/utilities.hpp"
#include "test/routines/level3/xgemm.hpp"
@@ -120,9 +121,9 @@ size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::st
}
// Prints and returns the statistics
- fprintf(stdout, " %zu test(s) passed\n", passed);
- fprintf(stdout, " %zu test(s) failed\n", errors);
- fprintf(stdout, "\n");
+ std::cout << " " << passed << " test(s) passed" << std::endl;
+ std::cout << " " << errors << " test(s) failed" << std::endl;
+ std::cout << std::endl;
return errors;
}
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index edd9d4ea..659131c5 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -198,12 +198,12 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
if (!TestSimilarity(result1[index], result2[index])) {
if (l2error >= kErrorMarginL2) { errors++; }
if (verbose_) {
- if (get_id2_(args) == 1) { fprintf(stdout, "\n Error at index %zu: ", id1); }
- else { fprintf(stdout, "\n Error at %zu,%zu: ", id1, id2); }
- fprintf(stdout, " %s (reference) versus ", ToString(result1[index]).c_str());
- fprintf(stdout, " %s (CLBlast)", ToString(result2[index]).c_str());
+ if (get_id2_(args) == 1) { std::cout << std::endl << " Error at index " << id1 << ": "; }
+ else { std::cout << std::endl << " Error at " << id1 << "," << id2 << ": "; }
+ std::cout << " " << ToString(result1[index]) << " (reference) versus ";
+ std::cout << " " << ToString(result2[index]) << " (CLBlast)";
if (l2error < kErrorMarginL2) {
- fprintf(stdout, " - error suppressed by a low total L2 error\n");
+ std::cout << " - error suppressed by a low total L2 error" << std::endl;
}
}
}
diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp
index 577a289e..1c0cf9e3 100644
--- a/test/correctness/testblas.hpp
+++ b/test/correctness/testblas.hpp
@@ -391,6 +391,12 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na
auto i_args = args;
i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize;
+ i_args.batch_count = 3;
+ i_args.alphas = std::vector<U>(i_args.batch_count);
+ i_args.betas = std::vector<U>(i_args.batch_count);
+ i_args.a_offsets = std::vector<size_t>(i_args.batch_count);
+ i_args.b_offsets = std::vector<size_t>(i_args.batch_count);
+ i_args.c_offsets = std::vector<size_t>(i_args.batch_count);
for (auto &x_size: x_sizes) { i_args.x_size = x_size;
for (auto &y_size: y_sizes) { i_args.y_size = y_size;
for (auto &a_size: a_sizes) { i_args.a_size = a_size;
diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp
index 1581fbfb..648aef6e 100644
--- a/test/correctness/tester.cpp
+++ b/test/correctness/tester.cpp
@@ -185,14 +185,14 @@ Tester<T,U>::Tester(const std::vector<std::string> &arguments, const bool silent
template <typename T, typename U>
Tester<T,U>::~Tester() {
if (PrecisionSupported<T>(device_)) {
- fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
- fprintf(stdout, " %zu test(s) passed\n", tests_passed_);
- if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); }
- fprintf(stdout, " %zu test(s) skipped%s\n", tests_skipped_, kPrintEnd.c_str());
- if (tests_failed_ > 0) { fprintf(stdout, "%s", kPrintError.c_str()); }
- fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
+ std::cout << "* Completed all test-cases for this routine. Results:" << std::endl;
+ std::cout << " " << tests_passed_ << " test(s) passed" << std::endl;
+ if (tests_skipped_ > 0) { std::cout << kPrintWarning; }
+ std::cout << " " << tests_skipped_ << " test(s) skipped" << kPrintEnd << std::endl;
+ if (tests_failed_ > 0) { std::cout << kPrintError; }
+ std::cout << " " << tests_failed_ << " test(s) failed" << kPrintEnd << std::endl;
}
- fprintf(stdout, "\n");
+ std::cout << std::endl;
// Cleans-up clBLAS
#ifdef CLBLAST_REF_CLBLAS
@@ -238,18 +238,18 @@ void Tester<T,U>::TestEnd() {
// Prints a test summary
auto pass_rate = 100*num_passed_ / static_cast<float>(num_passed_ + num_skipped_ + num_failed_);
fprintf(stdout, " Pass rate %s%5.1lf%%%s:", kPrintMessage.c_str(), pass_rate, kPrintEnd.c_str());
- fprintf(stdout, " %zu passed /", num_passed_);
+ std::cout << " " << num_passed_ << " passed /";
if (num_skipped_ != 0) {
- fprintf(stdout, " %s%zu skipped%s /", kPrintWarning.c_str(), num_skipped_, kPrintEnd.c_str());
+ std::cout << " " << kPrintWarning << num_skipped_ << " skipped" << kPrintEnd << " /";
}
else {
- fprintf(stdout, " %zu skipped /", num_skipped_);
+ std::cout << " " << num_skipped_ << " skipped /";
}
if (num_failed_ != 0) {
- fprintf(stdout, " %s%zu failed%s\n", kPrintError.c_str(), num_failed_, kPrintEnd.c_str());
+ std::cout << " " << kPrintError << num_failed_ << " failed" << kPrintEnd << std::endl;
}
else {
- fprintf(stdout, " %zu failed\n", num_failed_);
+ std::cout << " " << num_failed_ << " failed" << std::endl;
}
}
diff --git a/test/correctness/tester.hpp b/test/correctness/tester.hpp
index e544f776..caf03787 100644
--- a/test/correctness/tester.hpp
+++ b/test/correctness/tester.hpp
@@ -22,7 +22,7 @@
#include <vector>
#include <memory>
-#include "utilities/utilities.hpp"
+#include "test/test_utilities.hpp"
// The libraries
#ifdef CLBLAST_REF_CLBLAS
@@ -153,20 +153,38 @@ template <typename T, typename U> const size_t Tester<T,U>::kResultsPerLine = si
template <typename T, typename U> const float Tester<T,U>::kStatusError = -1.0f;
// Constants holding start and end strings for terminal-output in colour
-template <typename T, typename U> const std::string Tester<T,U>::kPrintError = "\x1b[31m";
-template <typename T, typename U> const std::string Tester<T,U>::kPrintSuccess = "\x1b[32m";
-template <typename T, typename U> const std::string Tester<T,U>::kPrintWarning = "\x1b[35m";
-template <typename T, typename U> const std::string Tester<T,U>::kPrintMessage = "\x1b[1m";
-template <typename T, typename U> const std::string Tester<T,U>::kPrintEnd = "\x1b[0m";
+#if defined(_WIN32)
+ template <typename T, typename U> const std::string Tester<T,U>::kPrintError = "";
+ template <typename T, typename U> const std::string Tester<T,U>::kPrintSuccess = "";
+ template <typename T, typename U> const std::string Tester<T,U>::kPrintWarning = "";
+ template <typename T, typename U> const std::string Tester<T,U>::kPrintMessage = "";
+ template <typename T, typename U> const std::string Tester<T,U>::kPrintEnd = "";
+#else
+ template <typename T, typename U> const std::string Tester<T,U>::kPrintError = "\x1b[31m";
+ template <typename T, typename U> const std::string Tester<T,U>::kPrintSuccess = "\x1b[32m";
+ template <typename T, typename U> const std::string Tester<T,U>::kPrintWarning = "\x1b[35m";
+ template <typename T, typename U> const std::string Tester<T,U>::kPrintMessage = "\x1b[1m";
+ template <typename T, typename U> const std::string Tester<T,U>::kPrintEnd = "\x1b[0m";
+#endif
// Sets the output error coding
-template <typename T, typename U> const std::string Tester<T,U>::kSuccessData = "\x1b[32m:\x1b[0m"; // success
-template <typename T, typename U> const std::string Tester<T,U>::kSuccessStatus = "\x1b[32m.\x1b[0m"; // success
-template <typename T, typename U> const std::string Tester<T,U>::kErrorData = "\x1b[31mX\x1b[0m"; // error
-template <typename T, typename U> const std::string Tester<T,U>::kErrorStatus = "\x1b[31m/\x1b[0m"; // error
-template <typename T, typename U> const std::string Tester<T,U>::kSkippedCompilation = "\x1b[35m\\\x1b[0m"; // warning
-template <typename T, typename U> const std::string Tester<T,U>::kUnsupportedPrecision = "\x1b[35mo\x1b[0m"; // warning
-template <typename T, typename U> const std::string Tester<T,U>::kUnsupportedReference = "\x1b[35m-\x1b[0m"; // warning
+#if defined(_WIN32)
+ template <typename T, typename U> const std::string Tester<T,U>::kSuccessData = ":"; // success
+ template <typename T, typename U> const std::string Tester<T,U>::kSuccessStatus = "."; // success
+ template <typename T, typename U> const std::string Tester<T,U>::kErrorData = "X"; // error
+ template <typename T, typename U> const std::string Tester<T,U>::kErrorStatus = "/"; // error
+ template <typename T, typename U> const std::string Tester<T,U>::kSkippedCompilation = "\\"; // warning
+ template <typename T, typename U> const std::string Tester<T,U>::kUnsupportedPrecision = "o"; // warning
+ template <typename T, typename U> const std::string Tester<T,U>::kUnsupportedReference = "-"; // warning
+#else
+ template <typename T, typename U> const std::string Tester<T,U>::kSuccessData = "\x1b[32m:\x1b[0m"; // success
+ template <typename T, typename U> const std::string Tester<T,U>::kSuccessStatus = "\x1b[32m.\x1b[0m"; // success
+ template <typename T, typename U> const std::string Tester<T,U>::kErrorData = "\x1b[31mX\x1b[0m"; // error
+ template <typename T, typename U> const std::string Tester<T,U>::kErrorStatus = "\x1b[31m/\x1b[0m"; // error
+ template <typename T, typename U> const std::string Tester<T,U>::kSkippedCompilation = "\x1b[35m\\\x1b[0m"; // warning
+ template <typename T, typename U> const std::string Tester<T,U>::kUnsupportedPrecision = "\x1b[35mo\x1b[0m"; // warning
+ template <typename T, typename U> const std::string Tester<T,U>::kUnsupportedReference = "\x1b[35m-\x1b[0m"; // warning
+#endif
// =================================================================================================
// Below are the non-member functions (separated because of otherwise required partial class
diff --git a/test/performance/client.hpp b/test/performance/client.hpp
index 47a13017..2ba09cb9 100644
--- a/test/performance/client.hpp
+++ b/test/performance/client.hpp
@@ -25,7 +25,7 @@
#include <vector>
#include <utility>
-#include "utilities/utilities.hpp"
+#include "test/test_utilities.hpp"
// The libraries to test
#ifdef CLBLAST_REF_CLBLAS
diff --git a/test/routines/common.hpp b/test/routines/common.hpp
index 9708288a..47c8f8d7 100644
--- a/test/routines/common.hpp
+++ b/test/routines/common.hpp
@@ -18,6 +18,7 @@
#include <string>
#include "utilities/utilities.hpp"
+#include "test/test_utilities.hpp"
#ifdef CLBLAST_REF_CLBLAS
#include "test/wrapper_clblas.hpp"
diff --git a/test/routines/levelx/xgemmbatched.hpp b/test/routines/levelx/xgemmbatched.hpp
index 56823e47..704d0578 100644
--- a/test/routines/levelx/xgemmbatched.hpp
+++ b/test/routines/levelx/xgemmbatched.hpp
@@ -110,6 +110,15 @@ class TestXgemmBatched {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
+ // Relaxed requirement on ld_a and ld_b within the library, this is here to match clBLAS
+ auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+ (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+ auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
+ (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
+ auto a_one = (!a_rotated) ? args.m : args.k;
+ auto b_one = (!b_rotated) ? args.k : args.n;
+ if (args.a_ld < a_one) { return StatusCode::kInvalidLeadDimA; }
+ if (args.b_ld < b_one) { return StatusCode::kInvalidLeadDimB; }
auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose,
args.m, args.n, args.k, args.alphas.data(),
buffers.a_mat(), args.a_offsets.data(), args.a_ld,
diff --git a/test/test_utilities.cpp b/test/test_utilities.cpp
new file mode 100644
index 00000000..b8fd94a9
--- /dev/null
+++ b/test/test_utilities.cpp
@@ -0,0 +1,114 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the test utility functions.
+//
+// =================================================================================================
+
+#include "test/test_utilities.hpp"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Returns whether a scalar is close to zero
+template <typename T> bool IsCloseToZero(const T value) { return (value > -SmallConstant<T>()) && (value < SmallConstant<T>()); }
+template bool IsCloseToZero<float>(const float);
+template bool IsCloseToZero<double>(const double);
+template <> bool IsCloseToZero(const half value) { return IsCloseToZero(HalfToFloat(value)); }
+template <> bool IsCloseToZero(const float2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); }
+template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); }
+
+// =================================================================================================
+
+template <typename T, typename U>
+void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
+ Queue &queue, const std::vector<std::string> &names) {
+ for (auto &name: names) {
+ if (name == kBufVecX) {buffers_host.x_vec = std::vector<T>(args.x_size, static_cast<T>(0)); buffers.x_vec.Read(queue, args.x_size, buffers_host.x_vec); }
+ else if (name == kBufVecY) { buffers_host.y_vec = std::vector<T>(args.y_size, static_cast<T>(0)); buffers.y_vec.Read(queue, args.y_size, buffers_host.y_vec); }
+ else if (name == kBufMatA) { buffers_host.a_mat = std::vector<T>(args.a_size, static_cast<T>(0)); buffers.a_mat.Read(queue, args.a_size, buffers_host.a_mat); }
+ else if (name == kBufMatB) { buffers_host.b_mat = std::vector<T>(args.b_size, static_cast<T>(0)); buffers.b_mat.Read(queue, args.b_size, buffers_host.b_mat); }
+ else if (name == kBufMatC) { buffers_host.c_mat = std::vector<T>(args.c_size, static_cast<T>(0)); buffers.c_mat.Read(queue, args.c_size, buffers_host.c_mat); }
+ else if (name == kBufMatAP) { buffers_host.ap_mat = std::vector<T>(args.ap_size, static_cast<T>(0)); buffers.ap_mat.Read(queue, args.ap_size, buffers_host.ap_mat); }
+ else if (name == kBufScalar) { buffers_host.scalar = std::vector<T>(args.scalar_size, static_cast<T>(0)); buffers.scalar.Read(queue, args.scalar_size, buffers_host.scalar); }
+ else { throw std::runtime_error("Invalid buffer name"); }
+ }
+}
+
+template <typename T, typename U>
+void HostToDevice(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
+ Queue &queue, const std::vector<std::string> &names) {
+ for (auto &name: names) {
+ if (name == kBufVecX) { buffers.x_vec.Write(queue, args.x_size, buffers_host.x_vec); }
+ else if (name == kBufVecY) { buffers.y_vec.Write(queue, args.y_size, buffers_host.y_vec); }
+ else if (name == kBufMatA) { buffers.a_mat.Write(queue, args.a_size, buffers_host.a_mat); }
+ else if (name == kBufMatB) { buffers.b_mat.Write(queue, args.b_size, buffers_host.b_mat); }
+ else if (name == kBufMatC) { buffers.c_mat.Write(queue, args.c_size, buffers_host.c_mat); }
+ else if (name == kBufMatAP) { buffers.ap_mat.Write(queue, args.ap_size, buffers_host.ap_mat); }
+ else if (name == kBufScalar) { buffers.scalar.Write(queue, args.scalar_size, buffers_host.scalar); }
+ else { throw std::runtime_error("Invalid buffer name"); }
+ }
+}
+
+// Compiles the above functions
+template void DeviceToHost(const Arguments<half>&, Buffers<half>&, BuffersHost<half>&, Queue&, const std::vector<std::string>&);
+template void DeviceToHost(const Arguments<float>&, Buffers<float>&, BuffersHost<float>&, Queue&, const std::vector<std::string>&);
+template void DeviceToHost(const Arguments<double>&, Buffers<double>&, BuffersHost<double>&, Queue&, const std::vector<std::string>&);
+template void DeviceToHost(const Arguments<float>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&);
+template void DeviceToHost(const Arguments<double>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&);
+template void DeviceToHost(const Arguments<float2>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&);
+template void DeviceToHost(const Arguments<double2>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&);
+template void HostToDevice(const Arguments<half>&, Buffers<half>&, BuffersHost<half>&, Queue&, const std::vector<std::string>&);
+template void HostToDevice(const Arguments<float>&, Buffers<float>&, BuffersHost<float>&, Queue&, const std::vector<std::string>&);
+template void HostToDevice(const Arguments<double>&, Buffers<double>&, BuffersHost<double>&, Queue&, const std::vector<std::string>&);
+template void HostToDevice(const Arguments<float>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&);
+template void HostToDevice(const Arguments<double>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&);
+template void HostToDevice(const Arguments<float2>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&);
+template void HostToDevice(const Arguments<double2>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&);
+
+// =================================================================================================
+
+// Conversion between half and single-precision
+std::vector<float> HalfToFloatBuffer(const std::vector<half>& source) {
+ auto result = std::vector<float>(source.size());
+ for (auto i = size_t(0); i < source.size(); ++i) { result[i] = HalfToFloat(source[i]); }
+ return result;
+}
+void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source) {
+ for (auto i = size_t(0); i < source.size(); ++i) { result[i] = FloatToHalf(source[i]); }
+}
+
+// As above, but now for OpenCL data-types instead of std::vectors
+Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw) {
+ const auto size = source.GetSize() / sizeof(half);
+ auto queue = Queue(queue_raw);
+ auto context = queue.GetContext();
+ auto source_cpu = std::vector<half>(size);
+ source.Read(queue, size, source_cpu);
+ auto result_cpu = HalfToFloatBuffer(source_cpu);
+ auto result = Buffer<float>(context, size);
+ result.Write(queue, size, result_cpu);
+ return result;
+}
+void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw) {
+ const auto size = source.GetSize() / sizeof(float);
+ auto queue = Queue(queue_raw);
+ auto context = queue.GetContext();
+ auto source_cpu = std::vector<float>(size);
+ source.Read(queue, size, source_cpu);
+ auto result_cpu = std::vector<half>(size);
+ FloatToHalfBuffer(result_cpu, source_cpu);
+ result.Write(queue, size, result_cpu);
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/test/test_utilities.hpp b/test/test_utilities.hpp
new file mode 100644
index 00000000..fc50a754
--- /dev/null
+++ b/test/test_utilities.hpp
@@ -0,0 +1,99 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides declarations for the common test utility functions (performance clients and
+// correctness testers).
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_UTILITIES_H_
+#define CLBLAST_TEST_UTILITIES_H_
+
+#include <string>
+
+#include "utilities/utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// The client-specific arguments in string form
+constexpr auto kArgCompareclblas = "clblas";
+constexpr auto kArgComparecblas = "cblas";
+constexpr auto kArgComparecublas = "cublas";
+constexpr auto kArgStepSize = "step";
+constexpr auto kArgNumSteps = "num_steps";
+constexpr auto kArgWarmUp = "warm_up";
+
+// The test-specific arguments in string form
+constexpr auto kArgFullTest = "full_test";
+constexpr auto kArgVerbose = "verbose";
+
+// =================================================================================================
+
+// Returns whether a scalar is close to zero
+template <typename T> bool IsCloseToZero(const T value);
+
+// =================================================================================================
+
+// Structure containing all possible buffers for test clients
+template <typename T>
+struct Buffers {
+ Buffer<T> x_vec;
+ Buffer<T> y_vec;
+ Buffer<T> a_mat;
+ Buffer<T> b_mat;
+ Buffer<T> c_mat;
+ Buffer<T> ap_mat;
+ Buffer<T> scalar;
+};
+template <typename T>
+struct BuffersHost {
+ std::vector<T> x_vec;
+ std::vector<T> y_vec;
+ std::vector<T> a_mat;
+ std::vector<T> b_mat;
+ std::vector<T> c_mat;
+ std::vector<T> ap_mat;
+ std::vector<T> scalar;
+};
+
+// =================================================================================================
+
+// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
+// data-types such as the Layout and Transpose data-types.
+template <typename T>
+std::string ToString(T value);
+
+// =================================================================================================
+
+// Copies buffers from the OpenCL device to the host
+template <typename T, typename U>
+void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
+ Queue &queue, const std::vector<std::string> &names);
+
+// Copies buffers from the host to the OpenCL device
+template <typename T, typename U>
+void HostToDevice(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
+ Queue &queue, const std::vector<std::string> &names);
+
+// =================================================================================================
+
+// Conversion between half and single-precision
+std::vector<float> HalfToFloatBuffer(const std::vector<half>& source);
+void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source);
+
+// As above, but now for OpenCL data-types instead of std::vectors
+Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw);
+void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_UTILITIES_H_
+#endif